From 0057f9dea80e8fd38f07c862ba00f839ff29b208 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 15 Jul 2024 18:58:32 -0700 Subject: [PATCH] fix(nltk): remedy Windows temp-file problem (#3395) **Summary** The implementation of `tempfile.NamedTemporaryFile` on Windows Python is problematic in certain situations. In particular, opening the temporary file by name while it is still open, rather than just using the file object returned by the context manager, raises `PermissionError`. Remedy this situation by using `tempfile.TemporaryDirectory` instead and using a file name of our choosing. The temporary directory is deleted with all its contents when the context manager closes so the effect is the same and does not produce the error on Windows. --- CHANGELOG.md | 4 +++- unstructured/__version__.py | 2 +- unstructured/nlp/tokenize.py | 12 ++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0e776c82..236031b0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.0-dev10 +## 0.15.0-dev11 ### Enhancements @@ -13,6 +13,8 @@ ### Fixes +* **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`. 
+ ## 0.14.10 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9ea2af740..326c34bad 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0-dev10" # pragma: no cover +__version__ = "0.15.0-dev11" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index a04a75d22..80641b135 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -83,20 +83,20 @@ def download_nltk_packages(): sha256.update(block) return sha256.hexdigest() - with tempfile.NamedTemporaryFile() as tmp_file: - tgz_file = tmp_file.name - urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file) + with tempfile.TemporaryDirectory() as temp_dir_path: + tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz") + urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path) - file_hash = sha256_checksum(tgz_file) + file_hash = sha256_checksum(tgz_file_path) if file_hash != NLTK_DATA_SHA256: - os.remove(tgz_file) + os.remove(tgz_file_path) raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}") # Extract the contents if not os.path.exists(nltk_data_dir): os.makedirs(nltk_data_dir) - with tarfile.open(tgz_file, "r:gz") as tar: + with tarfile.open(tgz_file_path, "r:gz") as tar: tar.extractall(path=nltk_data_dir)