fix(CVE-2024-39705): update to latest nltk version (#3512)

### Summary

Addresses
[CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705) by
updating to `nltk==3.8.2` and closes #3511. This CVE had previously been
mitigated in #3361.
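
For reviewers: rather than monkeypatching `nltk.download` to raise (the #3361 mitigation), `unstructured/nlp/tokenize.py` now downloads a pinned, SHA256-verified NLTK data tarball. Below is a minimal sketch of that flow, simplified from the diff in this commit; the `target_dir` parameter is an assumption standing in for the real code's NLTK default-data-directory resolution, which is elided here:

```python
import hashlib
import os
import tarfile
import tempfile
import urllib.request

# Pinned artifact and hash, as introduced in this PR.
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"


def download_nltk_packages(target_dir: str) -> None:
    """Fetch the pinned NLTK data tarball, verify its checksum, and unpack it."""
    with tempfile.TemporaryDirectory() as temp_dir_path:
        tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
        urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)

        # Refuse to unpack anything that does not match the pinned hash.
        sha256 = hashlib.sha256()
        with open(tgz_file_path, "rb") as f:
            for block in iter(lambda: f.read(65536), b""):
                sha256.update(block)
        if sha256.hexdigest() != NLTK_DATA_SHA256:
            raise ValueError(f"SHA-256 mismatch for {NLTK_DATA_FILENAME}")

        with tarfile.open(tgz_file_path, "r:gz") as tar:
            tar.extractall(path=target_dir)
```

Pinning both the artifact name and its digest means a tampered or partially downloaded tarball fails loudly instead of being unpacked.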

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Matt Robinson authored on 2024-08-13 09:39:29 -04:00; committed by GitHub
parent 1158d8f695
commit 7437f0a084
32 changed files with 57 additions and 75 deletions

@@ -120,8 +120,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10","3.11", "3.12"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -161,7 +159,6 @@ jobs:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
     env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
       UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
     needs: [setup, lint]
     steps:
@@ -179,6 +176,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y poppler-utils
           make install-pandoc install-test
@@ -193,8 +191,6 @@ jobs:
       matrix:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -211,6 +207,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           make test-no-extras CI=true
 
   test_unit_dependency_extras:
@@ -276,8 +273,6 @@ jobs:
       matrix:
         python-version: [ "3.9","3.10" ]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [ setup_ingest, lint ]
     steps:
       # actions/checkout MUST come before auth
@@ -296,6 +291,7 @@ jobs:
       - name: Test Ingest (unit)
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           PYTHONPATH=. pytest test_unstructured_ingest/unit
@@ -304,8 +300,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -373,6 +367,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc
@@ -391,8 +386,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -445,6 +438,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc

@@ -1,4 +1,4 @@
-## 0.15.2-dev8
+## 0.15.2
 
 ### Enhancements
@@ -10,6 +10,7 @@
 ### Fixes
 
+* **Updates NLTK data file for compatibility with `nltk>=3.8.2`.** The NLTK data file now contains `punkt_tab`, making it possible to upgrade to `nltk>=3.8.2`. `nltk==3.8.2` patches CVE-2024-39705.
 * **Renames Astra to Astra DB.** Conforms with DataStax internal naming conventions.
 * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
 * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves a problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.

@@ -38,8 +38,7 @@ install-huggingface:
 .PHONY: install-nltk-models
 install-nltk-models:
-	python3 -c "import nltk; nltk.download('punkt')"
-	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
+	python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
 
 .PHONY: install-test
 install-test:

@@ -57,7 +57,7 @@ jsonpath-python==1.0.6
     # via unstructured-client
 langdetect==1.0.9
     # via -r ./base.in
-lxml==5.2.2
+lxml==5.3.0
     # via -r ./base.in
 marshmallow==3.21.3
     # via
@@ -69,7 +69,7 @@ mypy-extensions==1.0.0
     #   unstructured-client
 nest-asyncio==1.6.0
     # via unstructured-client
-nltk==3.8.1
+nltk==3.8.2
     # via -r ./base.in
 numpy==1.26.4
     # via -r ./base.in

@@ -423,7 +423,7 @@ virtualenv==20.26.3
     # via pre-commit
 wcwidth==0.2.13
     # via prompt-toolkit
-webcolors==24.6.0
+webcolors==24.8.0
     # via jsonschema
 webencodings==0.5.1
     # via
@@ -437,7 +437,7 @@ wheel==0.44.0
     #   pip-tools
 widgetsnbextension==4.0.11
     # via ipywidgets
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-docx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx

@@ -8,5 +8,5 @@ importlib-metadata==8.2.0
     # via markdown
 markdown==3.6
     # via -r ./extra-markdown.in
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-odt.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx

@@ -78,7 +78,7 @@ lanms-neo==1.0.2
     # via unstructured-paddleocr
 lazy-loader==0.4
     # via scikit-image
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   premailer
@@ -191,7 +191,7 @@ sniffio==1.3.1
     #   -c ./base.txt
     #   anyio
     #   httpx
-tifffile==2024.7.24
+tifffile==2024.8.10
     # via scikit-image
 tqdm==4.66.5
     # via
@@ -208,5 +208,5 @@ urllib3==1.26.19
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   requests
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources

@@ -86,7 +86,7 @@ kiwisolver==1.4.5
     # via matplotlib
 layoutparser==0.3.4
     # via unstructured-inference
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   pikepdf
@@ -249,7 +249,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   python-dateutil
-sympy==1.13.1
+sympy==1.13.2
     # via
     #   onnxruntime
     #   torch
@@ -301,5 +301,5 @@ wrapt==1.16.0
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   deprecated
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-pptx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via python-pptx
 pillow==10.4.0
     # via python-pptx

@@ -85,7 +85,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   langdetect
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tokenizers==0.19.1
     # via

@@ -8,7 +8,7 @@ adlfs==2024.7.0
     # via -r ./ingest/azure.in
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via adlfs
 aiosignal==1.3.1
     # via aiohttp

@@ -194,7 +194,7 @@ sniffio==1.3.1
     #   httpx
 starlette==0.37.2
     # via fastapi
-sympy==1.13.1
+sympy==1.13.2
     # via onnxruntime
 tenacity==8.5.0
     # via
@@ -247,7 +247,7 @@ wrapt==1.16.0
     #   -c ./ingest/../deps/constraints.txt
     #   deprecated
     #   opentelemetry-instrumentation
-zipp==3.19.2
+zipp==3.20.0
     # via
     #   importlib-metadata
     #   importlib-resources

@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
     #   requests
 clarifai==10.7.0
     # via -r ./ingest/clarifai.in
-clarifai-grpc==10.7.0
+clarifai-grpc==10.7.1
     # via clarifai
 contextlib2==21.6.0
     # via schema

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via discord-py
 aiosignal==1.3.1
     # via aiohttp

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via elasticsearch
 aiosignal==1.3.1
     # via aiohttp
@@ -19,7 +19,7 @@ certifi==2024.7.4
     #   -c ./ingest/../base.txt
     #   -c ./ingest/../deps/constraints.txt
     #   elastic-transport
-elastic-transport==8.13.1
+elastic-transport==8.15.0
     # via elasticsearch
 elasticsearch[async]==8.14.0
     # via -r ./ingest/elasticsearch.in

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -70,7 +70,7 @@ langchain-core==0.2.29
     #   langchain-text-splitters
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community

@@ -49,7 +49,7 @@ langchain-core==0.2.29
     # via langchain-huggingface
 langchain-huggingface==0.0.3
     # via -r ./ingest/embed-huggingface.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
 markupsafe==2.1.5
     # via jinja2
@@ -107,7 +107,7 @@ scipy==1.11.3
     #   sentence-transformers
 sentence-transformers==3.0.1
     # via langchain-huggingface
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tenacity==8.5.0
     # via langchain-core

@@ -49,7 +49,7 @@ idna==3.7
     #   requests
 jiter==0.5.0
     # via openai
-openai==1.40.2
+openai==1.40.3
     # via -r ./ingest/embed-octoai.in
 pydantic==2.8.2
     # via openai

@@ -55,11 +55,11 @@ jsonpointer==3.0.0
     # via jsonpatch
 langchain-core==0.2.29
     # via langchain-openai
-langchain-openai==0.1.20
+langchain-openai==0.1.21
     # via -r ./ingest/embed-openai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
-openai==1.40.2
+openai==1.40.3
     # via langchain-openai
 orjson==3.10.7
     # via langsmith

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -120,7 +120,7 @@ langchain-google-vertexai==1.0.8
     # via -r ./ingest/embed-vertexai.in
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   voyageai
@@ -55,7 +55,7 @@ langchain-text-splitters==0.2.2
     # via langchain
 langchain-voyageai==0.1.1
     # via -r ./ingest/embed-voyageai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-core

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via gcsfs
 aiosignal==1.3.1
     # via aiohttp

@@ -4,11 +4,11 @@
 #
 # pip-compile ./ingest/s3.in
 #
-aiobotocore==2.13.1
+aiobotocore==2.13.2
     # via s3fs
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   aiobotocore
     #   s3fs

@@ -25,7 +25,7 @@ idna==3.7
     #   requests
 isodate==0.6.1
     # via zeep
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./ingest/../base.txt
     #   zeep

@@ -16,7 +16,7 @@ fsspec==2024.5.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   -r ./ingest/sftp.in
-paramiko==3.4.0
+paramiko==3.4.1
     # via -r ./ingest/sftp.in
 pycparser==2.22
     # via cffi

@@ -57,7 +57,7 @@ wheel==0.44.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   singlestoredb
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:

@@ -66,7 +66,7 @@ label-studio-tools==0.0.4
     # via label-studio-sdk
 liccheck==0.9.2
     # via -r ./test.in
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   label-studio-sdk

@@ -2,17 +2,11 @@ from typing import List, Tuple
 from unittest.mock import patch
 
 import nltk
 import pytest
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize
 
-def test_error_raised_on_nltk_download():
-    with pytest.raises(ValueError):
-        tokenize.nltk.download("tokenizers/punkt")
-
-
 def test_nltk_packages_download_if_not_present():
     with patch.object(nltk, "find", side_effect=LookupError):
         with patch.object(tokenize, "download_nltk_packages") as mock_download:

@@ -1 +1 @@
-__version__ = "0.15.2-dev8"  # pragma: no cover
+__version__ = "0.15.2"  # pragma: no cover

@@ -7,7 +7,7 @@ import tarfile
 import tempfile
 import urllib.request
 from functools import lru_cache
-from typing import Any, Final, List, Tuple
+from typing import Final, List, Tuple
 
 import nltk
 from nltk import pos_tag as _pos_tag
@@ -16,15 +16,9 @@ from nltk import word_tokenize as _word_tokenize
 CACHE_MAX_SIZE: Final[int] = 128
 
-NLTK_DATA_URL = "https://utic-public-cf.s3.amazonaws.com/nltk_data.tgz"
-NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61"
-
-
-def _raise_on_nltk_download(*args: Any, **kwargs: Any):
-    raise ValueError("NLTK download disabled. See CVE-2024-39705")
-
-
-nltk.download = _raise_on_nltk_download
+NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
+NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
+NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
 
 # NOTE(robinson) - mimic default dir logic from NLTK
@@ -84,7 +78,7 @@ def download_nltk_packages():
         return sha256.hexdigest()
 
     with tempfile.TemporaryDirectory() as temp_dir_path:
-        tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz")
+        tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
         urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
 
         file_hash = sha256_checksum(tgz_file_path)
@@ -120,10 +114,10 @@ def _download_nltk_packages_if_not_present():
     tagger_available = check_for_nltk_package(
         package_category="taggers",
-        package_name="averaged_perceptron_tagger",
+        package_name="averaged_perceptron_tagger_eng",
     )
     tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt"
+        package_category="tokenizers", package_name="punkt_tab"
     )
 
     if not (tokenizer_available and tagger_available):
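
After this change, the lazy-download path probes for the new-format packages (`punkt_tab`, `averaged_perceptron_tagger_eng`) before fetching anything. A self-contained sketch of that presence check, assuming `check_for_nltk_package` is essentially a wrapper over `nltk.find` (the real helper also threads a custom `NLTK_DATA` path list through, elided here):

```python
import nltk


def check_for_nltk_package(package_category: str, package_name: str) -> bool:
    """Return True when the NLTK package is already present on the search path."""
    try:
        nltk.find(f"{package_category}/{package_name}")
        return True
    except LookupError:
        return False
```

If either probe fails, `download_nltk_packages()` runs once to install the pinned tarball, after which both checks succeed on subsequent calls.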