Matt Robinson 7b25dfc337
fix(CVE-2024-39705): remove nltk download (#3361)
### Summary

Addresses
[CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705), which
highlights the risk of remote code execution when running
`nltk.download`. Removes `nltk.download` in favor of a `.tgz` file with
the appropriate NLTK data files, and checks the SHA256 hash to validate
the download. An error is now raised if `nltk.download` is invoked.

The logic for determining the NLTK download directory is borrowed from
`nltk`, so users can still set `NLTK_DATA` as they did previously.

### Testing

1. Create a directory called `~/tmp/nltk_test`. Set
`NLTK_DATA=${HOME}/tmp/nltk_test`.
2. From a python interactive session, run:
```python
from unstructured.nlp.tokenize import download_nltk_packages

download_nltk_packages()
```
3. Run `ls ~/tmp/nltk_test/nltk_data`. You should see the downloaded
data.

---------

Co-authored-by: Steve Canny <stcanny@gmail.com>
2024-07-08 22:55:36 +00:00

84 lines
2.8 KiB
Python

from typing import List, Tuple
from unittest.mock import patch
import nltk
import pytest
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
from unstructured.nlp import tokenize
def test_error_raised_on_nltk_download():
    """`nltk.download`, reached through `tokenize.nltk`, must raise `ValueError` when called."""
    download = tokenize.nltk.download
    with pytest.raises(ValueError):
        download("tokenizers/punkt")
def test_nltk_packages_download_if_not_present():
    """When `nltk.find` raises `LookupError`, the packages are downloaded exactly once."""
    # Make every package lookup fail, and intercept the download so nothing is fetched.
    find_patcher = patch.object(nltk, "find", side_effect=LookupError)
    download_patcher = patch.object(tokenize, "download_nltk_packages")
    with find_patcher, download_patcher as mock_download:
        tokenize._download_nltk_packages_if_not_present()
    mock_download.assert_called_once()
def test_nltk_packages_do_not_download_if():
    """When `nltk.find` succeeds, no package download is triggered.

    The download is intercepted at `tokenize.download_nltk_packages`, the same
    hook the "download if not present" test asserts is called. Mocking
    `nltk.download` instead would make the assertion vacuous: it would pass even
    if the download helper were wrongly invoked.
    """
    # A plain mock for `nltk.find` returns without raising, i.e. "package found".
    find_patcher = patch.object(nltk, "find")
    download_patcher = patch.object(tokenize, "download_nltk_packages")
    with find_patcher, download_patcher as mock_download:
        tokenize._download_nltk_packages_if_not_present()
    mock_download.assert_not_called()
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
    """Return a deterministic stand-in for a part-of-speech tagger.

    Args:
        tokens: The tokens to tag.

    Returns:
        One `(token, tag)` pair per input token, in order. The tag is `"VB"` when
        the token equals "ask" case-insensitively, and the empty string otherwise.
    """
    return [(token, "VB" if token.lower() == "ask" else "") for token in tokens]
def test_pos_tag():
    """`tokenize.pos_tag` yields the expected `(token, tag)` pairs for a short heading."""
    expected = [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]
    actual = tokenize.pos_tag("ITEM 2A. PROPERTIES")
    assert actual == expected
def test_word_tokenize_caches(monkeypatch):
    """A call to `tokenize.word_tokenize` adds exactly one entry to its cache.

    The private tokenizer and tagger are replaced with mocks for the duration of
    the test. The cache is emptied first so the size assertions start from zero,
    and emptied again afterwards so entries produced while the mocks were
    installed cannot be served to other tests.
    """
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.word_tokenize.cache_clear()
    try:
        assert tokenize.word_tokenize.cache_info().currsize == 0
        tokenize.word_tokenize("Greetings! I am from outer space.")
        assert tokenize.word_tokenize.cache_info().currsize == 1
    finally:
        # monkeypatch restores the patched attributes, but not the cache contents.
        tokenize.word_tokenize.cache_clear()
def test_sent_tokenize_caches(monkeypatch):
    """A call to `tokenize.sent_tokenize` adds exactly one entry to its cache.

    The private sentence tokenizer, word tokenizer and tagger are replaced with
    mocks for the duration of the test. The cache is emptied first so the size
    assertions start from zero, and emptied again afterwards so entries produced
    while the mocks were installed cannot be served to other tests.
    """
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.sent_tokenize.cache_clear()
    try:
        assert tokenize.sent_tokenize.cache_info().currsize == 0
        tokenize.sent_tokenize("Greetings! I am from outer space.")
        assert tokenize.sent_tokenize.cache_info().currsize == 1
    finally:
        # monkeypatch restores the patched attributes, but not the cache contents.
        tokenize.sent_tokenize.cache_clear()
def test_pos_tag_caches(monkeypatch):
    """A call to `tokenize.pos_tag` adds exactly one entry to its cache.

    The private word tokenizer and tagger are replaced with mocks for the
    duration of the test. The cache is emptied first so the size assertions start
    from zero, and emptied again afterwards so entries produced while the mocks
    were installed cannot be served to other tests.
    """
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.pos_tag.cache_clear()
    try:
        assert tokenize.pos_tag.cache_info().currsize == 0
        tokenize.pos_tag("Greetings! I am from outer space.")
        assert tokenize.pos_tag.cache_info().currsize == 1
    finally:
        # monkeypatch restores the patched attributes, but not the cache contents.
        tokenize.pos_tag.cache_clear()
def test_tokenizers_functions_run():
    """Smoke test: each public tokenizer entry point runs on a plain sentence without raising."""
    sentence = "I am a big brown bear. What are you?"
    entry_points = (tokenize.sent_tokenize, tokenize.word_tokenize, tokenize.pos_tag)
    for run_tokenizer in entry_points:
        run_tokenizer(sentence)