enhancement: apply tar filters when using python 3.12 or above (#3124)

### Summary

Applies tar filters when using Python 3.12 or above. This was added to
the [Python `tarfile` library in
3.12](https://docs.python.org/3/library/tarfile.html#extraction-filters)
and guards against malicious content being extracted from `.tar.gz`
files.

### Testing

Added smoke test. If this passes for all Python versions, we're good.
This commit is contained in:
Matt Robinson 2024-06-05 14:28:59 -04:00 committed by GitHub
parent fdb27378cb
commit 0e16bf4bf0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 29 additions and 0 deletions

View File

@ -2,6 +2,8 @@
### Enhancements
* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompressing malicious content in `.tar.gz` files. Extraction filters were added to the Python `tarfile` library in Python 3.12, so the change only applies when using Python 3.12 and above.
### Features
### Fixes

View File

@ -0,0 +1,15 @@
import os
import tarfile
from unstructured.ingest.utils.compression import uncompress_tar_file
def test_uncompress_tar_file(tmpdir):
    """Smoke-test that `uncompress_tar_file` extracts a gzipped tar archive.

    Builds a small gzip-compressed tar containing one example document,
    extracts it with `uncompress_tar_file`, and checks both the returned
    path and that the archived member actually landed on disk. The
    extraction target is the test's own temporary directory so nothing is
    written outside the per-test sandbox.
    """
    filename = "example-docs/fake-text.txt"
    member_name = os.path.basename(filename)
    tar_filename = os.path.join(tmpdir, "test.tar")
    output_dir = str(tmpdir)

    # The archive is written gzip-compressed ("w:gz") regardless of its file name.
    with tarfile.open(tar_filename, "w:gz") as tar:
        tar.add(filename, arcname=member_name)

    path = uncompress_tar_file(tar_filename, path=output_dir)

    # The function returns the directory it extracted into...
    assert path == output_dir
    # ...and the archived file must really have been extracted there.
    assert os.path.isfile(os.path.join(output_dir, member_name))

View File

@ -1,5 +1,6 @@
import copy
import os
import sys
import tarfile
import zipfile
from dataclasses import dataclass
@ -63,6 +64,17 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
logger.info(f"extracting tar {tar_filename} -> {path}")
with tarfile.open(tar_filename, "r:gz") as tfile:
# NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
# Extraction filters were added to the tarfile module in Python 3.12.
# Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
if sys.version_info >= (3, 12):
tfile.extraction_filter = tarfile.tar_filter
else:
logger.warning(
"Extraction filtering for tar files is available for Python 3.12 and above. "
"Consider upgrading your Python version to improve security. "
"See https://docs.python.org/3/library/tarfile.html#extraction-filters"
)
tfile.extractall(path=path)
return path