diff --git a/CHANGELOG.md b/CHANGELOG.md index c0e776c82..236031b0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.0-dev10 +## 0.15.0-dev11 ### Enhancements @@ -13,6 +13,8 @@ ### Fixes +* **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`. + ## 0.14.10 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9ea2af740..326c34bad 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0-dev10" # pragma: no cover +__version__ = "0.15.0-dev11" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index a04a75d22..80641b135 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -83,20 +83,20 @@ def download_nltk_packages(): sha256.update(block) return sha256.hexdigest() - with tempfile.NamedTemporaryFile() as tmp_file: - tgz_file = tmp_file.name - urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file) + with tempfile.TemporaryDirectory() as temp_dir_path: + tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz") + urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path) - file_hash = sha256_checksum(tgz_file) + file_hash = sha256_checksum(tgz_file_path) if file_hash != NLTK_DATA_SHA256: - os.remove(tgz_file) + os.remove(tgz_file_path) raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}") # Extract the contents if not os.path.exists(nltk_data_dir): os.makedirs(nltk_data_dir) - with tarfile.open(tgz_file, "r:gz") as tar: + with tarfile.open(tgz_file_path, "r:gz") as tar: tar.extractall(path=nltk_data_dir)