build: downgrade nltk version (#3527)

This PR rolls `nltk` back to `3.8.1`. It was bumped to `3.8.2` in
https://github.com/Unstructured-IO/unstructured/pull/3512, but
`3.8.2` is no longer available on PyPI due to some
issues (https://github.com/nltk/nltk/issues/3301).
This commit is contained in:
Christine Straub 2024-08-15 16:35:21 -07:00 committed by GitHub
parent 9b778e270d
commit d0211cc41f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 18 additions and 8 deletions

View File

@ -1,3 +1,14 @@
## 0.15.5-dev0
### Enhancements
### Features
### Fixes
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
## 0.15.4
### Enhancements

View File

@ -26,8 +26,7 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \
dnf -y groupremove "Development Tools" && \
dnf clean all
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
FROM deps as code

View File

@ -69,7 +69,7 @@ mypy-extensions==1.0.0
# unstructured-client
nest-asyncio==1.6.0
# via unstructured-client
nltk==3.8.2
nltk==3.8.1
# via -r ./base.in
numpy==1.26.4
# via -r ./base.in

View File

@ -1 +1 @@
__version__ = "0.15.4" # pragma: no cover
__version__ = "0.15.5-dev0" # pragma: no cover

View File

@ -16,9 +16,9 @@ from nltk import word_tokenize as _word_tokenize
CACHE_MAX_SIZE: Final[int] = 128
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
NLTK_DATA_FILENAME = "nltk_data.tgz"
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61"
# NOTE(robinson) - mimic default dir logic from NLTK
@ -114,10 +114,10 @@ def _download_nltk_packages_if_not_present():
tagger_available = check_for_nltk_package(
package_category="taggers",
package_name="averaged_perceptron_tagger_eng",
package_name="averaged_perceptron_tagger",
)
tokenizer_available = check_for_nltk_package(
package_category="tokenizers", package_name="punkt_tab"
package_category="tokenizers", package_name="punkt"
)
if not (tokenizer_available and tagger_available):