Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)

This pull request adds NLTK data to the Docker image by pre-packaging it, making deployment more reliable and efficient because the required NLTK resources are already available inside the container.

**Current updated solution:**

- **Dockerfile update:** Integrated the NLTK data directly into the Docker image so the API can operate independently of external data sources. The data is stored at `/home/notebook-user/nltk_data`.
- **Environment variable setup:** Configured the `NLTK_PATH` environment variable so Python scripts automatically locate and use the embedded NLTK data. This eliminates the need for manual configuration in deployment environments.
- **Code cleanup:** Removed outdated code in `tokenize.py` and related scripts that previously downloaded NLTK data from S3, streamlining the codebase and removing unnecessary dependencies.
- **Script updates:** Updated `tokenize.py` and `test_tokenize.py` to use the `NLTK_PATH` variable, ensuring consistent access to the embedded data across all environments (see the sketch after this list).
- **Dependency elimination:** Fully removed the reliance on the S3 bucket for NLTK data, mitigating risks from network failures or access changes.
- **Improved system reliability:** Embedding the assets in the Docker image gives the API a self-contained setup that behaves consistently regardless of where it is deployed.
- Updated the Dockerfile to copy the local NLTK data to the appropriate directory within the container.
- Adjusted the application setup to verify the presence of NLTK assets during the container build process.
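For reference, a minimal sketch of how a module such as `tokenize.py` could pick up the embedded data via the `NLTK_PATH` environment variable. The variable name and the `/home/notebook-user/nltk_data` path come from this PR description; the lookup and fallback logic shown here are assumptions for illustration, not the repository's actual implementation.

```python
import os

import nltk

# Hypothetical sketch: point NLTK at the data baked into the image.
# NLTK_PATH and /home/notebook-user/nltk_data come from the PR description;
# the fallback behavior below is assumed for illustration.
NLTK_DATA_DIR = os.environ.get("NLTK_PATH", "/home/notebook-user/nltk_data")

if NLTK_DATA_DIR not in nltk.data.path:
    # Prepend so the copy embedded in the image wins over any user-level downloads.
    nltk.data.path.insert(0, NLTK_DATA_DIR)


def sent_tokenize(text: str) -> list[str]:
    # With the data directory registered above, no download step is needed at runtime.
    return nltk.sent_tokenize(text)
```

A corresponding Dockerfile step would copy the pre-packaged data into `/home/notebook-user/nltk_data` and set the `NLTK_PATH` environment variable to that directory.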
60 lines · 2.1 KiB · Python
from typing import List, Tuple

from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize

from unstructured.nlp import tokenize


def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
    # Simple stand-in for the real POS tagger: only the word "ask" gets a verb tag.
    pos_tags: List[Tuple[str, str]] = []
    for token in tokens:
        if token.lower() == "ask":
            pos_tags.append((token, "VB"))
        else:
            pos_tags.append((token, ""))
    return pos_tags


def test_pos_tag():
    parts_of_speech = tokenize.pos_tag("ITEM 2A. PROPERTIES")
    assert parts_of_speech == [
        ("ITEM", "NNP"),
        ("2A", "CD"),
        (".", "."),
        ("PROPERTIES", "NN"),
    ]


def test_word_tokenize_caches(monkeypatch):
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    # Clearing the cache first lets us verify that a single call adds exactly one entry.
    tokenize.word_tokenize.cache_clear()
    assert tokenize.word_tokenize.cache_info().currsize == 0
    tokenize.word_tokenize("Greetings! I am from outer space.")
    assert tokenize.word_tokenize.cache_info().currsize == 1


def test_sent_tokenize_caches(monkeypatch):
    monkeypatch.setattr(tokenize, "_sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.sent_tokenize.cache_clear()
    assert tokenize.sent_tokenize.cache_info().currsize == 0
    tokenize.sent_tokenize("Greetings! I am from outer space.")
    assert tokenize.sent_tokenize.cache_info().currsize == 1


def test_pos_tag_caches(monkeypatch):
    monkeypatch.setattr(tokenize, "_word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(tokenize, "_pos_tag", mock_pos_tag)
    tokenize.pos_tag.cache_clear()
    assert tokenize.pos_tag.cache_info().currsize == 0
    tokenize.pos_tag("Greetings! I am from outer space.")
    assert tokenize.pos_tag.cache_info().currsize == 1


def test_tokenizers_functions_run():
    # Smoke test: the real tokenizers should run end to end without raising.
    sentence = "I am a big brown bear. What are you?"
    tokenize.sent_tokenize(sentence)
    tokenize.word_tokenize(sentence)
    tokenize.pos_tag(sentence)