mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-17 12:27:45 +00:00
fix(nltk): remedy Windows temp-file problem (#3395)
**Summary** The implementation of `tempfile.NamedTemporaryFile` on Windows Python is problematic in certain situations. In particular, it raises `PermissionError` when code attempts to access the temporary file by name rather than only through the file object returned by the context manager. Remedy this situation by using `tempfile.TemporaryDirectory` instead, with a file name of our choosing inside that directory. The temporary directory is deleted with all its contents when the context manager exits, so the effect is the same and the error no longer occurs on Windows.
This commit is contained in:
parent
e8b2297cbb
commit
0057f9dea8
@ -1,4 +1,4 @@
|
||||
## 0.15.0-dev10
|
||||
## 0.15.0-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -13,6 +13,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`.
|
||||
|
||||
## 0.14.10
|
||||
|
||||
### Enhancements
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.0-dev10" # pragma: no cover
|
||||
__version__ = "0.15.0-dev11" # pragma: no cover
|
||||
|
@ -83,20 +83,20 @@ def download_nltk_packages():
|
||||
sha256.update(block)
|
||||
return sha256.hexdigest()
|
||||
|
||||
with tempfile.NamedTemporaryFile() as tmp_file:
|
||||
tgz_file = tmp_file.name
|
||||
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file)
|
||||
with tempfile.TemporaryDirectory() as temp_dir_path:
|
||||
tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz")
|
||||
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
|
||||
|
||||
file_hash = sha256_checksum(tgz_file)
|
||||
file_hash = sha256_checksum(tgz_file_path)
|
||||
if file_hash != NLTK_DATA_SHA256:
|
||||
os.remove(tgz_file)
|
||||
os.remove(tgz_file_path)
|
||||
raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}")
|
||||
|
||||
# Extract the contents
|
||||
if not os.path.exists(nltk_data_dir):
|
||||
os.makedirs(nltk_data_dir)
|
||||
|
||||
with tarfile.open(tgz_file, "r:gz") as tar:
|
||||
with tarfile.open(tgz_file_path, "r:gz") as tar:
|
||||
tar.extractall(path=nltk_data_dir)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user