fix: correct nltk download arg order (#991)

* fix: correct argument order in nltk download

* add smoke test for tokenizers
Matt Robinson 2023-07-28 11:29:59 -04:00 committed by GitHub
parent 84db9c4937
commit e017e99b5b
4 changed files with 25 additions and 5 deletions


@@ -1,3 +1,13 @@
+## 0.8.7
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* Fix argument order in NLTK download step
+
 ## 0.8.6
 
 ### Enhancements


@@ -68,3 +68,10 @@ def test_pos_tag_caches(monkeypatch):
     assert tokenize.pos_tag.cache_info().currsize == 0
     tokenize.pos_tag("Greetings! I am from outer space.")
     assert tokenize.pos_tag.cache_info().currsize == 1
+
+
+def test_tokenizers_functions_run():
+    sentence = "I am a big brown bear. What are you?"
+    tokenize.sent_tokenize(sentence)
+    tokenize.word_tokenize(sentence)
+    tokenize.pos_tag(sentence)
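
For context on what the new smoke test exercises, here is a minimal interactive sketch of the same calls; the import path is an assumption, since this view only shows the module referenced as `tokenize` in the tests:

from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize  # assumed path

sentence = "I am a big brown bear. What are you?"

sentences = sent_tokenize(sentence)  # list of sentence strings
words = word_tokenize(sentence)      # list of word and punctuation tokens
tags = pos_tag(sentence)             # list of (token, POS tag) tuples

print(sentences)
print(words)
print(tags)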


@@ -1 +1 @@
-__version__ = "0.8.6" # pragma: no cover
+__version__ = "0.8.7" # pragma: no cover


@@ -26,22 +26,25 @@ def _download_nltk_package_if_not_present(package_name: str, package_category: str):
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
-    _download_nltk_package_if_not_present("tokenizers", "punkt")
+    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
     return _sent_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
-    _download_nltk_package_if_not_present("tokenizers", "punkt")
+    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
     return _word_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    _download_nltk_package_if_not_present("tokenizers", "punkt")
-    _download_nltk_package_if_not_present("taggers", "averaged_perceptron_tagger")
+    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
+    _download_nltk_package_if_not_present(
+        package_category="taggers",
+        package_name="averaged_perceptron_tagger",
+    )
     # NOTE(robinson) - Splitting into sentences before tokenizing. This helps with
     # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
     # for a verb because it looks like it's in verb form and "ITEM 1A." looks like the subject.
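
Why the original positional calls were wrong, as a minimal sketch: the signature in the hunk header takes package_name first and package_category second, so the old calls bound "tokenizers" to package_name and "punkt" to package_category. The helper body below is assumed (only its signature appears in this diff), but it illustrates the failure mode and why keyword arguments fix it:

import nltk

def _download_nltk_package_if_not_present(package_name: str, package_category: str):
    """Sketch of the helper: download an NLTK package only if it is not already available."""
    try:
        # NLTK resolves resources as "<category>/<name>", e.g. "tokenizers/punkt".
        nltk.data.find(f"{package_category}/{package_name}")
    except LookupError:
        nltk.download(package_name)

# Before the fix: positional arguments bind in the wrong order, so the lookup
# becomes "punkt/tokenizers" and the fallback tries to download a package
# named "tokenizers", which is not a real NLTK package.
_download_nltk_package_if_not_present("tokenizers", "punkt")

# After the fix: keyword arguments make the binding explicit and order-independent,
# so the lookup is "tokenizers/punkt" as intended.
_download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")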