Matt Robinson 08e091c5a9
chore: Reorganize partition bricks under partition directory (#76)
* move partition_pdf to partition folder

* move partition.py

* refactor partitioning bricks into partition directory

* import to nlp for backward compatibility

* update docs

* update version and bump changelog

* fix typo in changelog

* update readme reference
2022-11-21 22:27:23 +00:00

54 lines
1.9 KiB
Python

from typing import List, Tuple
import unstructured.nlp.tokenize as tokenize
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
    """Mock part-of-speech tagger for tests.

    Tags any token spelled "ask" (case-insensitive) as a verb ("VB") and
    every other token with an empty tag, so tests don't need the real
    NLTK tagger.

    Parameters
    ----------
    tokens: list of token strings to tag.

    Returns
    -------
    List of (token, tag) tuples, one per input token, in order.
    """
    # Comprehension replaces the original manual append loop (same output).
    return [(token, "VB" if token.lower() == "ask" else "") for token in tokens]
def test_pos_tag():
    """pos_tag should produce NLTK-style tags for a simple filing-header string."""
    expected = [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]
    assert tokenize.pos_tag("ITEM 2A. PROPERTIES") == expected
def test_word_tokenize_caches(monkeypatch):
    """A call to word_tokenize should populate its LRU cache with one entry."""
    # Patch out the heavy NLTK internals so the test stays fast and offline.
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)

    # Start from an empty cache, then verify one call adds exactly one entry.
    tokenize.word_tokenize.cache_clear()
    assert tokenize.word_tokenize.cache_info().currsize == 0

    tokenize.word_tokenize("Greetings! I am from outer space.")
    assert tokenize.word_tokenize.cache_info().currsize == 1
def test_sent_tokenize_caches(monkeypatch):
    """A call to sent_tokenize should populate its LRU cache with one entry."""
    # Patch out the heavy NLTK internals so the test stays fast and offline.
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)

    # Start from an empty cache, then verify one call adds exactly one entry.
    tokenize.sent_tokenize.cache_clear()
    assert tokenize.sent_tokenize.cache_info().currsize == 0

    tokenize.sent_tokenize("Greetings! I am from outer space.")
    assert tokenize.sent_tokenize.cache_info().currsize == 1
def test_pos_tag_caches(monkeypatch):
    """A call to pos_tag should populate its LRU cache with one entry."""
    # Patch out the heavy NLTK internals so the test stays fast and offline.
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)

    # Start from an empty cache, then verify one call adds exactly one entry.
    tokenize.pos_tag.cache_clear()
    assert tokenize.pos_tag.cache_info().currsize == 0

    tokenize.pos_tag("Greetings! I am from outer space.")
    assert tokenize.pos_tag.cache_info().currsize == 1