class DescribeMsgAttachmentFilenameSanitization:
    """Unit-test suite for filename sanitization in MSG attachments (GHSA-gm8q-m8mv-jj5m).

    Every test feeds one raw attachment file name to `_AttachmentPartitioner` and checks the
    value exposed by its `_attachment_file_name` attribute.
    """

    @staticmethod
    def _sanitized_file_name(raw_file_name: "str | None") -> str:
        """Return the file name `_AttachmentPartitioner` reports for `raw_file_name`.

        The attachment and options objects are stand-in mocks carrying the same attributes
        each test previously assigned by hand.
        """
        # Imported at call time, as each test did, so this suite adds no module-level
        # dependency on the private partitioner class.
        from unstructured.partition.msg import _AttachmentPartitioner

        attachment = Mock()
        attachment.file_name = raw_file_name
        attachment.file_bytes = b"content"
        attachment.last_modified = None

        opts = Mock()
        opts.metadata_last_modified = None

        return _AttachmentPartitioner(attachment, opts)._attachment_file_name

    def it_sanitizes_path_traversal_attempts(self):
        assert self._sanitized_file_name("../../../etc/passwd") == "passwd"

    def it_sanitizes_absolute_unix_paths(self):
        assert self._sanitized_file_name("/etc/passwd") == "passwd"

    def it_sanitizes_absolute_windows_paths(self):
        assert self._sanitized_file_name("C:\\Windows\\System32\\config\\sam") == "sam"

    def it_removes_null_bytes_from_filenames(self):
        file_name = self._sanitized_file_name("file\x00.txt")

        assert file_name == "file.txt"
        assert "\x00" not in file_name

    def it_handles_dot_and_dotdot_filenames(self):
        # Neither directory reference is usable as a file name.
        assert self._sanitized_file_name(".") == "unknown"
        assert self._sanitized_file_name("..") == "unknown"

    def it_handles_missing_filename(self):
        assert self._sanitized_file_name(None) == "unknown"

    def it_allows_valid_filenames_through(self):
        assert self._sanitized_file_name("document.pdf") == "document.pdf"

    def it_handles_complex_path_traversal_with_mixed_separators(self):
        assert self._sanitized_file_name("..\\../\\..\\etc/passwd") == "passwd"

    def it_handles_empty_string_filename(self):
        assert self._sanitized_file_name("") == "unknown"
@lazyproperty
def _attachment_file_name(self) -> str:
    """The original name of the attached file, no path.

    This value is 'unknown' if it is not present in the MSG file (not expected).

    The name is read from the MSG file and is sanitized to prevent path traversal
    attacks: directory components written with either "/" or "\\" are dropped, ASCII
    control characters (including NUL) are removed, and a result that is empty, "."
    or ".." is replaced by 'unknown'.
    """
    raw_filename = self._attachment.file_name or "unknown"

    # Treat "\" as a separator as well, so a Windows-style path is reduced to its last
    # component even when this code runs on a POSIX host.
    safe_filename = os.path.basename(raw_filename.replace("\\", "/"))

    # Remove NUL and every other ASCII control character (0x00-0x1F and 0x7F). Windows
    # does not allow characters 0-31 in file names.
    safe_filename = "".join(ch for ch in safe_filename if ch >= " " and ch != "\x7f")

    # Checked after stripping, so a name like ".\0." cannot collapse into ".." unnoticed.
    if not safe_filename or safe_filename in (".", ".."):
        return "unknown"

    return safe_filename