mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-12 08:37:51 +00:00
43 lines
1.3 KiB
Python
43 lines
1.3 KiB
Python
from functools import lru_cache
|
|
from typing import List, Tuple
|
|
import sys
|
|
|
|
if sys.version_info < (3, 8):
|
|
from typing_extensions import Final
|
|
else:
|
|
from typing import Final
|
|
|
|
from nltk import (
|
|
pos_tag as _pos_tag,
|
|
sent_tokenize as _sent_tokenize,
|
|
word_tokenize as _word_tokenize,
|
|
)
|
|
|
|
# Value passed as ``maxsize`` to every ``lru_cache`` decorator in this module, i.e. the
# number of distinct input strings each cached NLTK wrapper retains.
CACHE_MAX_SIZE: Final[int] = 128
|
|
|
|
|
|
@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
    """Split ``text`` into sentences using the NLTK sentence tokenizer.

    Results are memoized per input string by an LRU cache bounded by ``CACHE_MAX_SIZE``.
    A cache hit hands back the same list object as the earlier call, so callers should
    treat the result as read-only.
    """
    sentences: List[str] = _sent_tokenize(text)
    return sentences
|
|
|
|
|
|
@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
    """Split ``text`` into word tokens using the NLTK word tokenizer.

    Results are memoized per input string by an LRU cache bounded by ``CACHE_MAX_SIZE``.
    A cache hit hands back the same list object as the earlier call, so callers should
    treat the result as read-only.
    """
    tokens: List[str] = _word_tokenize(text)
    return tokens
|
|
|
|
|
|
@lru_cache(maxsize=CACHE_MAX_SIZE)
def pos_tag(text: str) -> List[Tuple[str, str]]:
    """Tag the tokens of ``text`` with the NLTK POS tagger, with LRU caching enabled.

    The text is split into sentences, each sentence is word-tokenized and tagged on its
    own, and the per-sentence results are concatenated in sentence order into one flat
    list. Results are memoized per input string (bounded by ``CACHE_MAX_SIZE``); a cache
    hit returns the same list object as the earlier call.
    """
    # NOTE(robinson) - Sentences are split out before tokenizing. This helps with
    # situations like "ITEM 1A. PROPERTIES", where "PROPERTIES" can be mistaken for a
    # verb because it looks like it's in verb form and "ITEM 1A." looks like the subject.
    return [
        tagged_token
        for sentence in _sent_tokenize(text)
        for tagged_token in _pos_tag(_word_tokenize(sentence))
    ]
|