mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-08 13:20:31 +00:00
enhancement: apply tar filters when using python 3.12 or above (#3124)
### Summary Applies tar filters when using Python 3.12 or above. This was added to the [Python `tarfile` library in 3.12](https://docs.python.org/3/library/tarfile.html#extraction-filters) and guards against malicious content being extracted from `.tar.gz` files. ### Testing Added smoke test. If this passes for all Python versions, we're good.
This commit is contained in:
parent
fdb27378cb
commit
0e16bf4bf0
@ -2,6 +2,8 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompressing malicious content in `.tar.gz` files. This was added to the Python `tarfile` lib in Python 3.12. The change only applies when using Python 3.12 and above.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
15
test_unstructured/ingest/utils/test_compression.py
Normal file
15
test_unstructured/ingest/utils/test_compression.py
Normal file
@ -0,0 +1,15 @@
|
||||
import os
|
||||
import tarfile
|
||||
|
||||
from unstructured.ingest.utils.compression import uncompress_tar_file
|
||||
|
||||
|
||||
def test_uncompress_tar_file(tmpdir):
    """Smoke-test `uncompress_tar_file` on a small gzipped tar archive.

    Builds an archive containing one example document, extracts it with
    `uncompress_tar_file`, and checks both the returned path and that the
    member was actually written to disk.
    """
    filename = "example-docs/fake-text.txt"
    member_name = os.path.basename(filename)
    tar_filename = os.path.join(tmpdir, "test.tar")
    # Extract into this test's own temporary directory rather than its parent,
    # which is shared with other tests' temporary directories.
    extract_dir = str(tmpdir)

    with tarfile.open(tar_filename, "w:gz") as tar:
        tar.add(filename, arcname=member_name)

    path = uncompress_tar_file(tar_filename, path=extract_dir)

    assert path == extract_dir
    # Verify extraction really happened; the return value alone only echoes
    # the `path` argument back.
    assert os.path.isfile(os.path.join(extract_dir, member_name))
|
||||
@ -1,5 +1,6 @@
|
||||
import copy
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
@ -63,6 +64,17 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
|
||||
path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
|
||||
logger.info(f"extracting tar {tar_filename} -> {path}")
|
||||
with tarfile.open(tar_filename, "r:gz") as tfile:
|
||||
# NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
|
||||
# This was added in Python 3.12
|
||||
# Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
|
||||
if sys.version_info >= (3, 12):
|
||||
tfile.extraction_filter = tarfile.tar_filter
|
||||
else:
|
||||
logger.warning(
|
||||
"Extraction filtering for tar files is available for Python 3.12 and above. "
|
||||
"Consider upgrading your Python version to improve security. "
|
||||
"See https://docs.python.org/3/library/tarfile.html#extraction-filters"
|
||||
)
|
||||
tfile.extractall(path=path)
|
||||
return path
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user