fix(CVE-2024-39705): update to latest nltk version (#3512)

### Summary

Addresses
[CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705) by
updating to `nltk==3.8.2` and closes #3511. This CVE had previously been
mitigated in #3361.
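
For reviewers: rather than monkeypatching `nltk.download` to raise (the #3361 mitigation), `unstructured/nlp/tokenize.py` now downloads a pinned, SHA256-verified NLTK data tarball. Below is a minimal sketch of that flow, simplified from the diff in this commit; the `target_dir` parameter is an assumption standing in for the real code's NLTK default-data-directory resolution, which is elided here:

```python
import hashlib
import os
import tarfile
import tempfile
import urllib.request

# Pinned artifact and hash, as introduced in this PR.
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"


def download_nltk_packages(target_dir: str) -> None:
    """Fetch the pinned NLTK data tarball, verify its checksum, and unpack it."""
    with tempfile.TemporaryDirectory() as temp_dir_path:
        tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
        urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)

        # Refuse to unpack anything that does not match the pinned hash.
        sha256 = hashlib.sha256()
        with open(tgz_file_path, "rb") as f:
            for block in iter(lambda: f.read(65536), b""):
                sha256.update(block)
        if sha256.hexdigest() != NLTK_DATA_SHA256:
            raise ValueError(f"SHA-256 mismatch for {NLTK_DATA_FILENAME}")

        with tarfile.open(tgz_file_path, "r:gz") as tar:
            tar.extractall(path=target_dir)
```

Pinning both the artifact name and its digest means a tampered or partially downloaded tarball fails loudly instead of being unpacked.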

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Matt Robinson authored on 2024-08-13 09:39:29 -04:00; committed by GitHub
parent 1158d8f695
commit 7437f0a084
32 changed files with 57 additions and 75 deletions

@@ -120,8 +120,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10","3.11", "3.12"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -161,7 +159,6 @@ jobs:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
     env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
       UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
     needs: [setup, lint]
     steps:
@@ -179,6 +176,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y poppler-utils
           make install-pandoc install-test
@@ -193,8 +191,6 @@ jobs:
       matrix:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -211,6 +207,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           make test-no-extras CI=true
 
   test_unit_dependency_extras:
@@ -276,8 +273,6 @@ jobs:
       matrix:
         python-version: [ "3.9","3.10" ]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [ setup_ingest, lint ]
     steps:
       # actions/checkout MUST come before auth
@@ -296,6 +291,7 @@ jobs:
       - name: Test Ingest (unit)
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           PYTHONPATH=. pytest test_unstructured_ingest/unit
@@ -304,8 +300,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -373,6 +367,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc
@@ -391,8 +386,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -445,6 +438,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc

@@ -1,4 +1,4 @@
-## 0.15.2-dev8
+## 0.15.2
 
 ### Enhancements
@@ -10,6 +10,7 @@
 ### Fixes
 
+* **Updates NLTK data file for compatibility with `nltk>=3.8.2`.** The NLTK data file now contains `punkt_tab`, making it possible to upgrade to `nltk>=3.8.2`. `nltk==3.8.2` patches CVE-2024-39705.
 * **Renames Astra to Astra DB.** Conforms with DataStax internal naming conventions.
 * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
 * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves a problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.

@@ -38,8 +38,7 @@ install-huggingface:
 .PHONY: install-nltk-models
 install-nltk-models:
-	python3 -c "import nltk; nltk.download('punkt')"
-	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
+	python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
 
 .PHONY: install-test
 install-test:

@@ -57,7 +57,7 @@ jsonpath-python==1.0.6
     # via unstructured-client
 langdetect==1.0.9
     # via -r ./base.in
-lxml==5.2.2
+lxml==5.3.0
     # via -r ./base.in
 marshmallow==3.21.3
     # via
@@ -69,7 +69,7 @@ mypy-extensions==1.0.0
     #   unstructured-client
 nest-asyncio==1.6.0
     # via unstructured-client
-nltk==3.8.1
+nltk==3.8.2
     # via -r ./base.in
 numpy==1.26.4
     # via -r ./base.in

@@ -423,7 +423,7 @@ virtualenv==20.26.3
     # via pre-commit
 wcwidth==0.2.13
     # via prompt-toolkit
-webcolors==24.6.0
+webcolors==24.8.0
     # via jsonschema
 webencodings==0.5.1
     # via
@@ -437,7 +437,7 @@ wheel==0.44.0
     #   pip-tools
 widgetsnbextension==4.0.11
     # via ipywidgets
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-docx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx

@@ -8,5 +8,5 @@ importlib-metadata==8.2.0
     # via markdown
 markdown==3.6
     # via -r ./extra-markdown.in
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-odt.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx

@@ -78,7 +78,7 @@ lanms-neo==1.0.2
     # via unstructured-paddleocr
 lazy-loader==0.4
     # via scikit-image
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   premailer
@@ -191,7 +191,7 @@ sniffio==1.3.1
     #   -c ./base.txt
     #   anyio
     #   httpx
-tifffile==2024.7.24
+tifffile==2024.8.10
     # via scikit-image
 tqdm==4.66.5
     # via
@@ -208,5 +208,5 @@ urllib3==1.26.19
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   requests
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources

@@ -86,7 +86,7 @@ kiwisolver==1.4.5
     # via matplotlib
 layoutparser==0.3.4
     # via unstructured-inference
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   pikepdf
@@ -249,7 +249,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   python-dateutil
-sympy==1.13.1
+sympy==1.13.2
     # via
     #   onnxruntime
     #   torch
@@ -301,5 +301,5 @@ wrapt==1.16.0
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   deprecated
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources

@@ -4,7 +4,7 @@
 #
 # pip-compile ./extra-pptx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via python-pptx
 pillow==10.4.0
     # via python-pptx

@@ -85,7 +85,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   langdetect
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tokenizers==0.19.1
     # via

@@ -8,7 +8,7 @@ adlfs==2024.7.0
     # via -r ./ingest/azure.in
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via adlfs
 aiosignal==1.3.1
     # via aiohttp

@@ -194,7 +194,7 @@ sniffio==1.3.1
     #   httpx
 starlette==0.37.2
     # via fastapi
-sympy==1.13.1
+sympy==1.13.2
     # via onnxruntime
 tenacity==8.5.0
     # via
@@ -247,7 +247,7 @@ wrapt==1.16.0
     #   -c ./ingest/../deps/constraints.txt
     #   deprecated
     #   opentelemetry-instrumentation
-zipp==3.19.2
+zipp==3.20.0
     # via
     #   importlib-metadata
     #   importlib-resources

@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
     #   requests
 clarifai==10.7.0
     # via -r ./ingest/clarifai.in
-clarifai-grpc==10.7.0
+clarifai-grpc==10.7.1
     # via clarifai
 contextlib2==21.6.0
     # via schema

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via discord-py
 aiosignal==1.3.1
     # via aiohttp

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via elasticsearch
 aiosignal==1.3.1
     # via aiohttp
@@ -19,7 +19,7 @@ certifi==2024.7.4
     #   -c ./ingest/../base.txt
     #   -c ./ingest/../deps/constraints.txt
     #   elastic-transport
-elastic-transport==8.13.1
+elastic-transport==8.15.0
     # via elasticsearch
 elasticsearch[async]==8.14.0
     # via -r ./ingest/elasticsearch.in

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -70,7 +70,7 @@ langchain-core==0.2.29
     #   langchain-text-splitters
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community

@@ -49,7 +49,7 @@ langchain-core==0.2.29
     # via langchain-huggingface
 langchain-huggingface==0.0.3
     # via -r ./ingest/embed-huggingface.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
 markupsafe==2.1.5
     # via jinja2
@@ -107,7 +107,7 @@ scipy==1.11.3
     #   sentence-transformers
 sentence-transformers==3.0.1
     # via langchain-huggingface
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tenacity==8.5.0
     # via langchain-core

@@ -49,7 +49,7 @@ idna==3.7
     #   requests
 jiter==0.5.0
     # via openai
-openai==1.40.2
+openai==1.40.3
     # via -r ./ingest/embed-octoai.in
 pydantic==2.8.2
     # via openai

@@ -55,11 +55,11 @@ jsonpointer==3.0.0
     # via jsonpatch
 langchain-core==0.2.29
     # via langchain-openai
-langchain-openai==0.1.20
+langchain-openai==0.1.21
     # via -r ./ingest/embed-openai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
-openai==1.40.2
+openai==1.40.3
     # via langchain-openai
 orjson==3.10.7
     # via langsmith

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -120,7 +120,7 @@ langchain-google-vertexai==1.0.8
     # via -r ./ingest/embed-vertexai.in
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   voyageai
@@ -55,7 +55,7 @@ langchain-text-splitters==0.2.2
     # via langchain
 langchain-voyageai==0.1.1
     # via -r ./ingest/embed-voyageai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-core

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via gcsfs
 aiosignal==1.3.1
     # via aiohttp

@@ -4,11 +4,11 @@
 #
 # pip-compile ./ingest/s3.in
 #
-aiobotocore==2.13.1
+aiobotocore==2.13.2
     # via s3fs
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   aiobotocore
     #   s3fs

@@ -25,7 +25,7 @@ idna==3.7
     #   requests
 isodate==0.6.1
     # via zeep
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./ingest/../base.txt
     #   zeep

@@ -16,7 +16,7 @@ fsspec==2024.5.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   -r ./ingest/sftp.in
-paramiko==3.4.0
+paramiko==3.4.1
     # via -r ./ingest/sftp.in
 pycparser==2.22
     # via cffi

@@ -57,7 +57,7 @@ wheel==0.44.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   singlestoredb
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:

@@ -66,7 +66,7 @@ label-studio-tools==0.0.4
     # via label-studio-sdk
 liccheck==0.9.2
     # via -r ./test.in
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   label-studio-sdk

@@ -2,17 +2,11 @@ from typing import List, Tuple
 from unittest.mock import patch
 
 import nltk
 import pytest
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize
 
-def test_error_raised_on_nltk_download():
-    with pytest.raises(ValueError):
-        tokenize.nltk.download("tokenizers/punkt")
-
-
 def test_nltk_packages_download_if_not_present():
     with patch.object(nltk, "find", side_effect=LookupError):
         with patch.object(tokenize, "download_nltk_packages") as mock_download:

@@ -1 +1 @@
-__version__ = "0.15.2-dev8"  # pragma: no cover
+__version__ = "0.15.2"  # pragma: no cover

@@ -7,7 +7,7 @@ import tarfile
 import tempfile
 import urllib.request
 from functools import lru_cache
-from typing import Any, Final, List, Tuple
+from typing import Final, List, Tuple
 
 import nltk
 from nltk import pos_tag as _pos_tag
@@ -16,15 +16,9 @@ from nltk import word_tokenize as _word_tokenize
 CACHE_MAX_SIZE: Final[int] = 128
 
-NLTK_DATA_URL = "https://utic-public-cf.s3.amazonaws.com/nltk_data.tgz"
-NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61"
-
-
-def _raise_on_nltk_download(*args: Any, **kwargs: Any):
-    raise ValueError("NLTK download disabled. See CVE-2024-39705")
-
-
-nltk.download = _raise_on_nltk_download
+NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
+NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
+NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
 
 # NOTE(robinson) - mimic default dir logic from NLTK
@@ -84,7 +78,7 @@ def download_nltk_packages():
         return sha256.hexdigest()
 
     with tempfile.TemporaryDirectory() as temp_dir_path:
-        tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz")
+        tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
         urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
 
         file_hash = sha256_checksum(tgz_file_path)
@@ -120,10 +114,10 @@ def _download_nltk_packages_if_not_present():
     tagger_available = check_for_nltk_package(
         package_category="taggers",
-        package_name="averaged_perceptron_tagger",
+        package_name="averaged_perceptron_tagger_eng",
     )
     tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt"
+        package_category="tokenizers", package_name="punkt_tab"
     )
 
     if not (tokenizer_available and tagger_available):
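
After this change, the lazy-download path probes for the new-format packages (`punkt_tab`, `averaged_perceptron_tagger_eng`) before fetching anything. A self-contained sketch of that presence check, assuming `check_for_nltk_package` is essentially a wrapper over `nltk.find` (the real helper also threads a custom `NLTK_DATA` path list through, elided here):

```python
import nltk


def check_for_nltk_package(package_category: str, package_name: str) -> bool:
    """Return True when the NLTK package is already present on the search path."""
    try:
        nltk.find(f"{package_category}/{package_name}")
        return True
    except LookupError:
        return False
```

If either probe fails, `download_nltk_packages()` runs once to install the pinned tarball, after which both checks succeed on subsequent calls.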