mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-29 00:50:28 +00:00
build: downgrade nltk version (#3527)
This PR rolls `nltk` back to `3.8.1`. It was bumped to `3.8.2` in https://github.com/Unstructured-IO/unstructured/pull/3512, but `3.8.2` is no longer available on PyPI due to some issues (https://github.com/nltk/nltk/issues/3301).
This commit is contained in:
parent
9b778e270d
commit
d0211cc41f
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
||||
## 0.15.5-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
|
||||
|
||||
|
||||
## 0.15.4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -26,8 +26,7 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \
|
||||
dnf -y groupremove "Development Tools" && \
|
||||
dnf clean all
|
||||
|
||||
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
|
||||
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
|
||||
RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
|
||||
|
||||
FROM deps as code
|
||||
|
||||
|
||||
@ -69,7 +69,7 @@ mypy-extensions==1.0.0
|
||||
# unstructured-client
|
||||
nest-asyncio==1.6.0
|
||||
# via unstructured-client
|
||||
nltk==3.8.2
|
||||
nltk==3.8.1
|
||||
# via -r ./base.in
|
||||
numpy==1.26.4
|
||||
# via -r ./base.in
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.4" # pragma: no cover
|
||||
__version__ = "0.15.5-dev0" # pragma: no cover
|
||||
|
||||
@ -16,9 +16,9 @@ from nltk import word_tokenize as _word_tokenize
|
||||
|
||||
CACHE_MAX_SIZE: Final[int] = 128
|
||||
|
||||
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
|
||||
NLTK_DATA_FILENAME = "nltk_data.tgz"
|
||||
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
|
||||
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
|
||||
NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61"
|
||||
|
||||
|
||||
# NOTE(robinson) - mimic default dir logic from NLTK
|
||||
@ -114,10 +114,10 @@ def _download_nltk_packages_if_not_present():
|
||||
|
||||
tagger_available = check_for_nltk_package(
|
||||
package_category="taggers",
|
||||
package_name="averaged_perceptron_tagger_eng",
|
||||
package_name="averaged_perceptron_tagger",
|
||||
)
|
||||
tokenizer_available = check_for_nltk_package(
|
||||
package_category="tokenizers", package_name="punkt_tab"
|
||||
package_category="tokenizers", package_name="punkt"
|
||||
)
|
||||
|
||||
if not (tokenizer_available and tagger_available):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user