diff --git a/CHANGELOG.md b/CHANGELOG.md index 21c86340d..1e07fb69e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.15.7 + +### Enhancements + +### Features + +### Fixes + +* **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data. + ## 0.15.6 ### Enhancements @@ -10,7 +20,6 @@ * **Update CI for `ingest-test-fixture-update-pr` to resolve NLTK model download errors.** * **Synchronized text and html on `TableChunk` splits.** When a `Table` element is divided during chunking to fit the chunking window, `TableChunk.text` corresponds exactly with the table text in `TableChunk.metadata.text_as_html`, `.text_as_html` is always parseable HTML, and the table is split on even row boundaries whenever possible. - ## 0.15.5 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 986e0018a..d57a4f171 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.6" # pragma: no cover +__version__ = "0.15.7" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index fe39e3a77..9f438ed1d 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -70,6 +70,10 @@ def download_nltk_packages(): if nltk_data_dir is None: raise OSError("NLTK data directory does not exist or is not writable.") + # Check if the path ends with "nltk_data" and remove it if it does + if nltk_data_dir.endswith("nltk_data"): + nltk_data_dir = os.path.dirname(nltk_data_dir) + def sha256_checksum(filename: str, block_size: int = 65536): sha256 = hashlib.sha256() with open(filename, "rb") as f: