mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
fix: correct nltk download arg order (#991)
* fix: correct download order to nltk args * add smoke test for tokenizers
This commit is contained in:
parent
84db9c4937
commit
e017e99b5b
10
CHANGELOG.md
10
CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.8.7
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* Fix argument order in NLTK download step
+
 ## 0.8.6

 ### Enhancements
@@ -68,3 +68,10 @@ def test_pos_tag_caches(monkeypatch):
     assert tokenize.pos_tag.cache_info().currsize == 0
     tokenize.pos_tag("Greetings! I am from outer space.")
     assert tokenize.pos_tag.cache_info().currsize == 1
+
+
||||
def test_tokenizers_functions_run():
    """Smoke test: each cached NLTK wrapper runs end-to-end without raising.

    Exercises sent_tokenize, word_tokenize, and pos_tag on a simple
    two-sentence input; no return values are asserted — this only verifies
    the wrappers (and their on-demand NLTK package downloads) do not error.
    """
    sentence = "I am a big brown bear. What are you?"
    tokenize.sent_tokenize(sentence)
    tokenize.word_tokenize(sentence)
    tokenize.pos_tag(sentence)
|
||||
|
||||
@@ -1 +1 @@
-__version__ = "0.8.6"  # pragma: no cover
+__version__ = "0.8.7"  # pragma: no cover
@@ -26,22 +26,25 @@ def _download_nltk_package_if_not_present(package_name: str, package_category: s
@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
    """A wrapper around the NLTK sentence tokenizer with LRU caching enabled.

    Args:
        text: The raw text to split into sentences.

    Returns:
        The list of sentence strings produced by NLTK's sentence tokenizer.
    """
    # Keyword arguments prevent the (name, category) vs (category, name)
    # positional mix-up: the helper's signature is
    # _download_nltk_package_if_not_present(package_name, package_category).
    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
    return _sent_tokenize(text)
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
    """A wrapper around the NLTK word tokenizer with LRU caching enabled.

    Args:
        text: The raw text to split into word tokens.

    Returns:
        The list of token strings produced by NLTK's word tokenizer.
    """
    # Keyword arguments prevent the (name, category) vs (category, name)
    # positional mix-up: the helper's signature is
    # _download_nltk_package_if_not_present(package_name, package_category).
    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
    return _word_tokenize(text)
|
||||
|
||||
|
||||
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    _download_nltk_package_if_not_present("tokenizers", "punkt")
-    _download_nltk_package_if_not_present("taggers", "averaged_perceptron_tagger")
+    _download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
+    _download_nltk_package_if_not_present(
+        package_category="taggers",
+        package_name="averaged_perceptron_tagger",
+    )
     # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
     # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
     # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
Loading…
x
Reference in New Issue
Block a user