Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-07-04 15:42:16 +00:00.

* Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
71 lines
2.5 KiB
Python
from typing import List, Tuple
|
|
from unittest.mock import patch
|
|
|
|
import nltk
|
|
|
|
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
|
|
from unstructured.nlp import tokenize
|
|
|
|
|
|
def test_nltk_packages_download_if_not_present():
    """The helper triggers a download when ``nltk.find`` raises ``LookupError``."""
    with patch.object(nltk, "find", side_effect=LookupError), patch.object(
        nltk, "download"
    ) as mock_download:
        tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

    mock_download.assert_called_with("fake_package")
|
def test_nltk_packages_do_not_download_if():
    """No download happens when ``nltk.find`` locates the package."""
    with patch.object(nltk, "find"):
        with patch.object(nltk, "download") as mock_download:
            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

    mock_download.assert_not_called()
|
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
    """Tag 'ask' (case-insensitive) as a verb ("VB"); every other token gets "".

    Stand-in for an NLTK POS tagger so tests avoid loading real models.
    """
    return [(token, "VB" if token.lower() == "ask" else "") for token in tokens]
|
def test_pos_tag():
    """pos_tag assigns the expected tag to each token of the input string."""
    expected = [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]
    assert tokenize.pos_tag("ITEM 2A. PROPERTIES") == expected
|
def test_word_tokenize_caches(monkeypatch):
    """A first call to word_tokenize populates its LRU cache."""
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)

    cached = tokenize.word_tokenize
    cached.cache_clear()
    assert cached.cache_info().currsize == 0

    cached("Greetings! I am from outer space.")
    assert cached.cache_info().currsize == 1
|
def test_sent_tokenize_caches(monkeypatch):
    """A first call to sent_tokenize populates its LRU cache."""
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)

    cached = tokenize.sent_tokenize
    cached.cache_clear()
    assert cached.cache_info().currsize == 0

    cached("Greetings! I am from outer space.")
    assert cached.cache_info().currsize == 1
|
def test_pos_tag_caches(monkeypatch):
    """A first call to pos_tag populates its LRU cache."""
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)

    cached = tokenize.pos_tag
    cached.cache_clear()
    assert cached.cache_info().currsize == 0

    cached("Greetings! I am from outer space.")
    assert cached.cache_info().currsize == 1