mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Fix attachments with = in filename (#1110)
Fix attachments with = in filename
* Limit the split to the first match of `=` to prevent creating a list of more than two parts
* Add an example email with such an attachment name and a test for the issue
This commit is contained in:
parent
fc2699ff06
commit
2e0ab86c6a
@ -17,6 +17,7 @@
|
|||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* make notion module discoverable
|
* make notion module discoverable
|
||||||
|
* Fix email attachment filenames which had `=` in the filename itself
|
||||||
|
|
||||||
## 0.9.2
|
## 0.9.2
|
||||||
|
|
||||||
|
67
example-docs/eml/email-equals-attachment-filename.eml
Normal file
67
example-docs/eml/email-equals-attachment-filename.eml
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
Return-Path: <testuser@example.com>
|
||||||
|
Delivered-To: recipient@example.com
|
||||||
|
Received: from mail-il1-x135.google.com (mail-il1-x135.google.com [IPv6:2607:f8b0:4864:20::135])
|
||||||
|
by spool.mail.gandi.net (Postfix) with ESMTPS id 30071740049
|
||||||
|
for <recipient@example.com>; Sun, 13 Aug 2023 22:00:09 +0000 (UTC)
|
||||||
|
Received: by mail-il1-x135.google.com with SMTP id e9e14a558f8ab-34aa0845837so895295ab.1
|
||||||
|
for <recipient@example.com>; Sun, 13 Aug 2023 15:00:09 -0700 (PDT)
|
||||||
|
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
|
||||||
|
d=gmail.com; s=20221208; t=1691964008; x=1692568808;
|
||||||
|
h=to:subject:message-id:date:from:mime-version:from:to:cc:subject
|
||||||
|
:date:message-id:reply-to;
|
||||||
|
bh=u2zZbdTcme/MRpud6yh6mKzbHh7iBKn7qvZ1YZJcZuQ=;
|
||||||
|
b=LxHBRFvl8tDcIithe7Il7GC7rAEu5QHGoko+PZll4SUDgh0gYHu35ksEuMO3bBT3sB
|
||||||
|
UGM5/Obbn+17F+DL0Mk/Zyc/6gG15lNMVLcr9+Fzjt2hDkrcUsEAkmS9chFiF0asGebj
|
||||||
|
F3vn1FJ9ZDi3IISHeD80PzmhT23Zp4ELjrfEGv2go7Psb320wzL58mHObkhz2spXEK0c
|
||||||
|
YzlCkJd8hBz2wI5mKedzf4mLdbTUZhPpmycvS+NkNwxQzaMXouLEkBvOXticqPQHvbTe
|
||||||
|
IiTb2JsaTFEJCfDVjhzIuGA6fFqNmH7hz7Fjh6eW66msB2QCIAhWHIIQ0Uy0Lx0FaQeo
|
||||||
|
pA5w==
|
||||||
|
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
|
||||||
|
d=1e100.net; s=20221208; t=1691964008; x=1692568808;
|
||||||
|
h=to:subject:message-id:date:from:mime-version:x-gm-message-state
|
||||||
|
:from:to:cc:subject:date:message-id:reply-to;
|
||||||
|
bh=u2zZbdTcme/MRpud6yh6mKzbHh7iBKn7qvZ1YZJcZuQ=;
|
||||||
|
b=InxJceRYlLILD0JdetCcOea42zYYLr8BYlhXxpB07TuFAiKbIV28vxmp7XaEa6YIuA
|
||||||
|
IkpNpvLWrlRJpLsKvJF9QRdIAt83p+91zvd2BW8M0P7AP04KofvzGbbCFG67tjr23K7S
|
||||||
|
YYuVSIXgVli3sjbIMsxq/JnaHWk1fnrGBvpnMLEEekqsdXpyL5GJ0yN0Qb/4lBZO1uO9
|
||||||
|
oZ0gbwqEMA/eHAnpH5W/g9ubkVcXzfjSPCRzzNhXfOEGn3Cc5sAuEH03iVuVIKMe9FJg
|
||||||
|
sO5iyah9+tjnm1NBWCk2qSIuCJrA0YvqcoztgpmJYDDQtG6scHRL83DdMx7phwRlVd/l
|
||||||
|
S6rQ==
|
||||||
|
X-Gm-Message-State: AOJu0YzoTpbToiITeHpRUQB8Tc5krfAtkhP2TRgs0WdgPAgfeUixZft6
|
||||||
|
vGUz3KcsN2V+qf2+RQPiveSjelXe81VfycqaH+I2hUNd
|
||||||
|
X-Google-Smtp-Source: AGHT+IFzHJ5xiuLxHriivr/CAV7z2Qo6Jep/LEhlzu4GiHEoXTFGC1DZ/MTDROwUz3fXKlKLU6uBzylF4XSOdKWfTW8=
|
||||||
|
X-Received: by 2002:a05:6e02:1bee:b0:349:2d1d:e463 with SMTP id
|
||||||
|
y14-20020a056e021bee00b003492d1de463mr12404311ilv.13.1691964008036; Sun, 13
|
||||||
|
Aug 2023 15:00:08 -0700 (PDT)
|
||||||
|
MIME-Version: 1.0
|
||||||
|
From: Test User <testuser@example.com>
|
||||||
|
Date: Sun, 13 Aug 2023 14:59:56 -0700
|
||||||
|
Message-ID: <CABBgHeGAW=UW77EE7p4CsCuaudixAYUU8iPqsm3=4921Wc9Vxw@mail.gmail.com>
|
||||||
|
Subject: Odd filename example
|
||||||
|
To: recipient@example.com
|
||||||
|
Content-Type: multipart/mixed; boundary="000000000000ac11b20602d51124"
|
||||||
|
X-GND-Status: LEGIT
|
||||||
|
|
||||||
|
--000000000000ac11b20602d51124
|
||||||
|
Content-Type: multipart/alternative; boundary="000000000000ac11b10602d51122"
|
||||||
|
|
||||||
|
--000000000000ac11b10602d51122
|
||||||
|
Content-Type: text/plain; charset="UTF-8"
|
||||||
|
|
||||||
|
Below is an example of an odd filename
|
||||||
|
|
||||||
|
--000000000000ac11b10602d51122
|
||||||
|
Content-Type: text/html; charset="UTF-8"
|
||||||
|
|
||||||
|
<div dir="ltr">Below is an example of an odd filename</div>
|
||||||
|
|
||||||
|
--000000000000ac11b10602d51122--
|
||||||
|
--000000000000ac11b20602d51124
|
||||||
|
Content-Type: text/plain; charset="US-ASCII"; name="odd=file=name.txt"
|
||||||
|
Content-Disposition: attachment; filename="odd=file=name.txt"
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
Content-ID: <f_ll9zod670>
|
||||||
|
X-Attachment-Id: f_ll9zod670
|
||||||
|
|
||||||
|
T2RkIGZpbGVuYW1lCg==
|
||||||
|
--000000000000ac11b20602d51124--
|
@ -491,3 +491,15 @@ def test_partition_email_custom_metadata_date(
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_odd_attachment_filename(
    filename="example-docs/eml/email-equals-attachment-filename.eml",
):
    """Check that an attachment filename containing ``=`` is kept intact.

    The example email carries an attachment named ``odd=file=name.txt``; the
    test partitions the email with attachment processing enabled and checks
    the filename recorded in the metadata of the second returned element.
    """
    partitioned = partition_email(
        filename=filename,
        process_attachments=True,
        attachment_partitioner=partition_text,
    )

    # NOTE(review): index 1 is presumably the first element produced from the
    # attachment (index 0 being the email body) -- confirm against the fixture.
    second_element = partitioned[1]
    assert second_element.metadata.filename == "odd=file=name.txt"
|
||||||
|
@ -168,7 +168,7 @@ def extract_attachment_info(
|
|||||||
|
|
||||||
if item.lower() == "attachment":
|
if item.lower() == "attachment":
|
||||||
continue
|
continue
|
||||||
key, value = item.split("=")
|
key, value = item.split("=", 1)
|
||||||
key = clean_extra_whitespace(key.replace('"', ""))
|
key = clean_extra_whitespace(key.replace('"', ""))
|
||||||
value = clean_extra_whitespace(value.replace('"', ""))
|
value = clean_extra_whitespace(value.replace('"', ""))
|
||||||
attachment_info[clean_extra_whitespace(key)] = clean_extra_whitespace(
|
attachment_info[clean_extra_whitespace(key)] = clean_extra_whitespace(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user