mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00

### Summary Addresses [CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705), which highlights the risk of remote code execution when running `nltk.download`. Removes `nltk.download` in favor of a `.tgz` file with the appropriate NLTK data files and checking the SHA256 hash to validate the download. An error now raises if `nltk.download` is invoked. The logic for determining the NLTK download directory is borrowed from `nltk`, so users can still set `NLTK_DATA` as they did previously. ### Testing 1. Create a directory called `~/tmp/nltk_test`. Set `NLTK_DATA=${HOME}/tmp/nltk_test`. 2. From a python interactive session, run: ```python from unstructured.nlp.tokenize import download_nltk_packages download_nltk_packages() ``` 3. Run `ls ~/tmp/nltk_test/nltk_data`. You should see the downloaded data. --------- Co-authored-by: Steve Canny <stcanny@gmail.com>
84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
from typing import List, Tuple
|
|
from unittest.mock import patch
|
|
|
|
import nltk
|
|
import pytest
|
|
|
|
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
|
|
from unstructured.nlp import tokenize
|
|
|
|
|
|
def test_error_raised_on_nltk_download():
    """Invoking `nltk.download` through the tokenize module must raise.

    This is the CVE-2024-39705 guard: the module replaces the downloader so
    any attempt to call it fails loudly instead of fetching remote data.
    """
    download = tokenize.nltk.download
    with pytest.raises(ValueError):
        download("tokenizers/punkt")
|
|
|
|
|
|
def test_nltk_packages_download_if_not_present():
    """When `nltk.find` reports data missing, the download routine runs once."""
    with patch.object(nltk, "find", side_effect=LookupError), patch.object(
        tokenize, "download_nltk_packages"
    ) as download_spy:
        tokenize._download_nltk_packages_if_not_present()
        download_spy.assert_called_once()
|
|
|
|
|
|
def test_nltk_packages_do_not_download_if():
    """If `nltk.find` succeeds (data already present), nothing is downloaded."""
    with patch.object(nltk, "find"):
        with patch.object(nltk, "download") as download_spy:
            tokenize._download_nltk_packages_if_not_present()
    download_spy.assert_not_called()
|
|
|
|
|
|
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
    """Deterministic stand-in for a POS tagger used by the cache tests.

    Tags any case variant of "ask" as a verb ("VB"); every other token gets
    an empty tag string.
    """
    return [
        (token, "VB") if token.lower() == "ask" else (token, "")
        for token in tokens
    ]
|
|
|
|
|
|
def test_pos_tag():
    """`pos_tag` yields the expected tags for a short SEC-filing-style heading."""
    expected = [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]
    assert tokenize.pos_tag("ITEM 2A. PROPERTIES") == expected
|
|
|
|
|
|
def test_word_tokenize_caches(monkeypatch):
    """A single `word_tokenize` call adds exactly one LRU-cache entry."""
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)

    def cache_size():
        return tokenize.word_tokenize.cache_info().currsize

    tokenize.word_tokenize.cache_clear()
    assert cache_size() == 0
    tokenize.word_tokenize("Greetings! I am from outer space.")
    assert cache_size() == 1
|
|
|
|
|
|
def test_sent_tokenize_caches(monkeypatch):
    """A single `sent_tokenize` call adds exactly one LRU-cache entry."""
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)

    def cache_size():
        return tokenize.sent_tokenize.cache_info().currsize

    tokenize.sent_tokenize.cache_clear()
    assert cache_size() == 0
    tokenize.sent_tokenize("Greetings! I am from outer space.")
    assert cache_size() == 1
|
|
|
|
|
|
def test_pos_tag_caches(monkeypatch):
    """A single `pos_tag` call adds exactly one LRU-cache entry."""
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)

    def cache_size():
        return tokenize.pos_tag.cache_info().currsize

    tokenize.pos_tag.cache_clear()
    assert cache_size() == 0
    tokenize.pos_tag("Greetings! I am from outer space.")
    assert cache_size() == 1
|
|
|
|
|
|
def test_tokenizers_functions_run():
    """Smoke test: each public tokenizer entry point runs without raising."""
    text = "I am a big brown bear. What are you?"
    for entry_point in (tokenize.sent_tokenize, tokenize.word_tokenize, tokenize.pos_tag):
        entry_point(text)
|