fix: only download nltk packages if necessary (#985)

* fix: only download nltk if necessary

* changelog and version
Matt Robinson 2023-07-27 12:10:25 -04:00 committed by GitHub
parent 15618e8346
commit e7f2f1e3eb
3 changed files with 8 additions and 12 deletions

CHANGELOG.md

@@ -1,4 +1,4 @@
-## 0.8.5-dev1
+## 0.8.5-dev2
 ### Enhancements
@@ -9,6 +9,7 @@
 ### Fixes
+* NLTK now only gets downloaded if necessary.
 * Handling for empty tables in Word Documents and PowerPoints.
 ## 0.8.4

unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.8.5-dev1" # pragma: no cover
+__version__ = "0.8.5-dev2" # pragma: no cover

unstructured/nlp/tokenize.py

@@ -3,7 +3,7 @@ from functools import lru_cache
 from typing import List, Tuple
 if sys.version_info < (3, 8):
-    from typing_extensions import Final
+    from typing_extensions import Final # pragma: no cover
 else:
     from typing import Final
@@ -23,30 +23,25 @@ def _download_nltk_package_if_not_present(package_name: str, package_category: s
         nltk.download(package_name)
-NLTK_PACKAGES = [
-    ("tokenizers", "punkt"),
-    ("taggers", "averaged_perceptron_tagger"),
-]
-for package_category, package_name in NLTK_PACKAGES:
-    _download_nltk_package_if_not_present(package_name, package_category)
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
+    _download_nltk_package_if_not_present("tokenizers", "punkt")
     return _sent_tokenize(text)
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
+    _download_nltk_package_if_not_present("tokenizers", "punkt")
     return _word_tokenize(text)
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
+    _download_nltk_package_if_not_present("tokenizers", "punkt")
+    _download_nltk_package_if_not_present("taggers", "averaged_perceptron_tagger")
     # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
     # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
     # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
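
For context, here is a minimal standalone sketch of the check-then-download pattern this commit moves into each wrapper. It is illustrative rather than the library's own module: the helper name _ensure_nltk_package, the cache size, and the usage line are placeholders, while nltk.data.find, nltk.download, and nltk.sent_tokenize are standard NLTK calls.

# Minimal sketch (assumed names, not part of the commit): download an NLTK
# resource only when it is missing, then tokenize with a cached wrapper.
from functools import lru_cache
from typing import List

import nltk
from nltk import sent_tokenize as _sent_tokenize


def _ensure_nltk_package(package_category: str, package_name: str) -> None:
    """Download an NLTK package only if it is not already available."""
    try:
        # Raises LookupError when the resource is not on the NLTK data path.
        nltk.data.find(f"{package_category}/{package_name}")
    except LookupError:
        nltk.download(package_name)


@lru_cache(maxsize=128)
def sent_tokenize(text: str) -> List[str]:
    """Split text into sentences, fetching the punkt model on first use."""
    _ensure_nltk_package("tokenizers", "punkt")
    return _sent_tokenize(text)


print(sent_tokenize("ITEM 1A. PROPERTIES. The company owns offices."))

With this shape, importing the module no longer triggers any network activity: the availability check runs only when a tokenizer is actually called, and nltk.download is reached only if the resource is missing from the local NLTK data path.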