mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 12:21:30 +00:00
fix: only download nltk packages if necessary (#985)
* fix: only download nltk if necessary * changelog and version
This commit is contained in:
parent
15618e8346
commit
e7f2f1e3eb
@ -1,4 +1,4 @@
|
||||
## 0.8.5-dev1
|
||||
## 0.8.5-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* NLTK now only gets downloaded if necessary.
|
||||
* Handling for empty tables in Word Documents and PowerPoints.
|
||||
|
||||
## 0.8.4
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.5-dev1" # pragma: no cover
|
||||
__version__ = "0.8.5-dev2" # pragma: no cover
|
||||
|
||||
@ -3,7 +3,7 @@ from functools import lru_cache
|
||||
from typing import List, Tuple
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
from typing_extensions import Final # pragma: no cover
|
||||
else:
|
||||
from typing import Final
|
||||
|
||||
@ -23,30 +23,25 @@ def _download_nltk_package_if_not_present(package_name: str, package_category: s
|
||||
nltk.download(package_name)
|
||||
|
||||
|
||||
NLTK_PACKAGES = [
|
||||
("tokenizers", "punkt"),
|
||||
("taggers", "averaged_perceptron_tagger"),
|
||||
]
|
||||
|
||||
for package_category, package_name in NLTK_PACKAGES:
|
||||
_download_nltk_package_if_not_present(package_name, package_category)
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
    """A wrapper around the NLTK sentence tokenizer with LRU caching enabled.

    Before tokenizing, the "punkt" tokenizer package is requested through
    ``_download_nltk_package_if_not_present``. The result of ``_sent_tokenize``
    is returned and memoized per input string by ``lru_cache``.
    """
    # NOTE: keyword arguments are deliberate. The helper's signature is
    # (package_name, package_category); passing ("tokenizers", "punkt")
    # positionally hands the two values over in swapped order.
    _download_nltk_package_if_not_present(
        package_name="punkt",
        package_category="tokenizers",
    )
    return _sent_tokenize(text)
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
    """A wrapper around the NLTK word tokenizer with LRU caching enabled.

    Before tokenizing, the "punkt" tokenizer package is requested through
    ``_download_nltk_package_if_not_present``. The result of ``_word_tokenize``
    is returned and memoized per input string by ``lru_cache``.
    """
    # NOTE: keyword arguments are deliberate. The helper's signature is
    # (package_name, package_category); passing ("tokenizers", "punkt")
    # positionally hands the two values over in swapped order.
    _download_nltk_package_if_not_present(
        package_name="punkt",
        package_category="tokenizers",
    )
    return _word_tokenize(text)
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_MAX_SIZE)
|
||||
def pos_tag(text: str) -> List[Tuple[str, str]]:
|
||||
"""A wrapper around the NLTK POS tagger with LRU caching enabled."""
|
||||
_download_nltk_package_if_not_present("tokenizers", "punkt")
|
||||
_download_nltk_package_if_not_present("taggers", "averaged_perceptron_tagger")
|
||||
# NOTE(robinson) - Splitting into sentences before tokenizing. This helps with
|
||||
# situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
|
||||
# for a verb because it looks like it's in verb form and "ITEM 1A." looks like the subject.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user