from typing import List, Tuple
from unittest.mock import patch

import nltk

from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
from unstructured.nlp import tokenize


def test_nltk_packages_download_if_not_present():
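    """When nltk.find raises a LookupError, the package is treated as missing and downloaded."""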
    with patch.object(nltk, "find", side_effect=LookupError):
        with patch.object(nltk, "download") as mock_download:
            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

    # The mock object remains accessible after the context exits, so the call is verified here.
    mock_download.assert_called_with("fake_package")


def test_nltk_packages_do_not_download_if_present():
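    """When nltk.find succeeds, the package is already present, so no download is triggered."""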
    with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
        tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

    mock_download.assert_not_called()


def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
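    """Fake POS tagger: tags "ask" as a verb ("VB") and every other token with an empty tag."""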
    pos_tags: List[Tuple[str, str]] = []
    for token in tokens:
        if token.lower() == "ask":
            pos_tags.append((token, "VB"))
        else:
            pos_tags.append((token, ""))
    return pos_tags


def test_pos_tag():
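    """pos_tag should return Penn Treebank-style tags (e.g. "NNP", "CD") for each token."""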
    parts_of_speech = tokenize.pos_tag("ITEM 2A. PROPERTIES")
    assert parts_of_speech == [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]


def test_word_tokenize_caches(monkeypatch):
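    """word_tokenize is memoized: a single call should add exactly one entry to the cache."""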
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.word_tokenize.cache_clear()
    assert tokenize.word_tokenize.cache_info().currsize == 0
    tokenize.word_tokenize("Greetings! I am from outer space.")
    assert tokenize.word_tokenize.cache_info().currsize == 1


def test_sent_tokenize_caches(monkeypatch):
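    """sent_tokenize is memoized: a single call should add exactly one entry to the cache."""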
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.sent_tokenize.cache_clear()
    assert tokenize.sent_tokenize.cache_info().currsize == 0
    tokenize.sent_tokenize("Greetings! I am from outer space.")
    assert tokenize.sent_tokenize.cache_info().currsize == 1


def test_pos_tag_caches(monkeypatch):
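    """pos_tag is memoized: a single call should add exactly one entry to the cache."""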
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.pos_tag.cache_clear()
    assert tokenize.pos_tag.cache_info().currsize == 0
    tokenize.pos_tag("Greetings! I am from outer space.")
    assert tokenize.pos_tag.cache_info().currsize == 1


def test_tokenizers_functions_run():
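    """Smoke test: each tokenizer function should run end-to-end without raising."""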
    sentence = "I am a big brown bear. What are you?"
    tokenize.sent_tokenize(sentence)
    tokenize.word_tokenize(sentence)
    tokenize.pos_tag(sentence)