mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-14 00:54:22 +00:00
feat: Add MSGToDocument converter (#8868)
* Initial commit of MSG converter from Bijay * Updates to the MSG converter * Add license header * Add tests for msg converter * Update converter * Expanding tests * Update docstrings * add license header * Add reno * Add to inits and pydocs * Add test for empty input * Fix types * Fix mypy --------- Co-authored-by: Bijay Gurung <bijay.learning@gmail.com>
This commit is contained in:
parent
d7dfc5222c
commit
99a998f90b
@ -9,6 +9,7 @@ loaders:
|
||||
"html",
|
||||
"json",
|
||||
"markdown",
|
||||
"msg",
|
||||
"openapi_functions",
|
||||
"output_adapter",
|
||||
"pdfminer",
|
||||
|
||||
@ -14,6 +14,7 @@ _import_structure = {
|
||||
"html": ["HTMLToDocument"],
|
||||
"json": ["JSONConverter"],
|
||||
"markdown": ["MarkdownToDocument"],
|
||||
"msg": ["MSGToDocument"],
|
||||
"openapi_functions": ["OpenAPIServiceToFunctions"],
|
||||
"output_adapter": ["OutputAdapter"],
|
||||
"pdfminer": ["PDFMinerToDocument"],
|
||||
@ -31,6 +32,7 @@ if TYPE_CHECKING:
|
||||
from .html import HTMLToDocument
|
||||
from .json import JSONConverter
|
||||
from .markdown import MarkdownToDocument
|
||||
from .msg import MSGToDocument
|
||||
from .openapi_functions import OpenAPIServiceToFunctions
|
||||
from .output_adapter import OutputAdapter
|
||||
from .pdfminer import PDFMinerToDocument
|
||||
|
||||
194
haystack/components/converters/msg.py
Normal file
194
haystack/components/converters/msg.py
Normal file
@ -0,0 +1,194 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import io
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from haystack import Document, component, logging
|
||||
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
||||
from haystack.dataclasses import ByteStream
|
||||
from haystack.lazy_imports import LazyImport
|
||||
|
||||
with LazyImport("Run 'pip install python-oxmsg'") as oxmsg_import:
|
||||
from oxmsg import Message, recipient
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class MSGToDocument:
    """
    Converts Microsoft Outlook .msg files into Haystack Documents.

    This component extracts email metadata (such as sender, recipients, CC, BCC, subject) and body content from .msg
    files and converts them into structured Haystack Documents. Additionally, any file attachments within the .msg
    file are extracted as ByteStream objects.

    ### Example Usage

    ```python
    from haystack.components.converters.msg import MSGToDocument
    from datetime import datetime

    converter = MSGToDocument()
    results = converter.run(sources=["sample.msg"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    attachments = results["attachments"]
    print(documents[0].content)
    ```
    """

    def __init__(self, store_full_path: bool = False) -> None:
        """
        Creates a MSGToDocument component.

        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Run the lazy-import check for the optional `python-oxmsg` dependency at construction time.
        oxmsg_import.check()
        self.store_full_path = store_full_path

    @staticmethod
    def _is_encrypted(msg: "Message") -> bool:
        """
        Determines whether the provided MSG file is encrypted.

        The check looks for the substring "encrypted" in the message's `Content-Type` header; a missing header
        is treated as "not encrypted".

        :param msg: The MSG file as a parsed Message object.
        :returns: True if the MSG file is encrypted, otherwise False.
        """
        return "encrypted" in msg.message_headers.get("Content-Type", "")

    @staticmethod
    def _create_recipient_str(recip: "recipient.Recipient") -> str:
        """
        Formats a recipient's name and email into a single string.

        Empty (or missing) parts are left out, so a recipient with only a name or only an email address does not
        produce stray leading/trailing whitespace.

        :param recip: A recipient object extracted from the MSG file.
        :returns: A formatted string combining the recipient's name and email address.
        """
        return " ".join(part for part in (recip.name, recip.email_address) if part)

    def _convert(self, file_content: io.BytesIO) -> Tuple[str, List[ByteStream]]:
        """
        Converts the MSG file content into text and extracts any attachments.

        The text consists of one header line per available field (From, To, Cc, Bcc, Subject), followed by a blank
        line and the message body when a body is present.

        :param file_content: The MSG file content as a binary stream.
        :returns: A tuple containing the extracted email text and a list of ByteStream objects for attachments.
        :raises ValueError: If the MSG file is encrypted and cannot be read.
        """
        msg = Message.load(file_content)
        if self._is_encrypted(msg):
            raise ValueError("The MSG file is encrypted and cannot be read.")

        header_lines: List[str] = []

        # Sender
        if msg.sender is not None:
            header_lines.append(f"From: {msg.sender}")

        # To
        recipients_str = ",".join(self._create_recipient_str(r) for r in msg.recipients)
        if recipients_str:
            header_lines.append(f"To: {recipients_str}")

        # CC / BCC: the header name may appear in either capitalisation; empty values are skipped so that no
        # dangling "Cc: " / "Bcc: " line is written.
        for label in ("Cc", "Bcc"):
            header_value = msg.message_headers.get(label) or msg.message_headers.get(label.upper())
            if header_value:
                header_lines.append(f"{label}: {header_value}")

        # Subject
        if msg.subject:
            header_lines.append(f"Subject: {msg.subject}")

        txt = "".join(f"{line}\n" for line in header_lines)

        # Body, separated from the headers by a blank line
        if msg.body is not None:
            txt += "\n" + msg.body

        # Attachments without payload bytes are ignored.
        attachments = [
            ByteStream(
                data=attachment.file_bytes, meta={"file_path": attachment.file_name}, mime_type=attachment.mime_type
            )
            for attachment in msg.attachments
            if attachment.file_bytes is not None
        ]

        return txt, attachments

    @component.output_types(documents=List[Document], attachments=List[ByteStream])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, Union[List[Document], List[ByteStream]]]:
        """
        Converts MSG files to Documents.

        Sources that cannot be read or converted are skipped and a warning is logged for each of them.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents.
            - `attachments`: Created ByteStream objects from file attachments. Each attachment carries the metadata
              of its parent Document, its own `file_path` and, when the parent has a `file_path`, a
              `parent_file_path` entry.
        """
        if len(sources) == 0:
            return {"documents": [], "attachments": []}

        documents: List[Document] = []
        all_attachments: List[ByteStream] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                text, attachments = self._convert(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

            # User-supplied metadata takes precedence over the metadata carried by the ByteStream.
            merged_metadata = {**bytestream.meta, **metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                merged_metadata["file_path"] = os.path.basename(bytestream.meta["file_path"])

            documents.append(Document(content=text, meta=merged_metadata))

            for attachment in attachments:
                attachment_meta = dict(merged_metadata)
                # A ByteStream source may carry no `file_path`; only record the parent path when it is known
                # instead of failing with a KeyError.
                if "file_path" in merged_metadata:
                    attachment_meta["parent_file_path"] = merged_metadata["file_path"]
                attachment_meta["file_path"] = attachment.meta["file_path"]
                all_attachments.append(
                    ByteStream(data=attachment.data, meta=attachment_meta, mime_type=attachment.mime_type)
                )

        return {"documents": documents, "attachments": all_attachments}
|
||||
@ -113,6 +113,7 @@ extra-dependencies = [
|
||||
"jq", # JSONConverter
|
||||
"openpyxl", # XLSXToDocument
|
||||
"tabulate", # XLSXToDocument
|
||||
"python-oxmsg", # MSGToDocument
|
||||
|
||||
"nltk>=3.9.1", # NLTKDocumentSplitter
|
||||
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add a new MSGToDocument component to convert .msg files into Haystack Document objects.
|
||||
- Extracts email metadata (e.g. sender, recipients, CC, BCC, subject) and body content into a Document.
|
||||
- Converts attachments into ByteStream objects which can be passed on to a FileTypeRouter + relevant converters.
|
||||
38
test/components/converters/test_msg_to_document.py
Normal file
38
test/components/converters/test_msg_to_document.py
Normal file
@ -0,0 +1,38 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from haystack.components.converters.msg import MSGToDocument
|
||||
|
||||
|
||||
class TestMSGToDocument:
    """Tests for the MSGToDocument converter."""

    def test_run(self, test_files_path):
        """A valid .msg file yields one Document plus its attachment, both carrying the merged metadata."""
        converter = MSGToDocument(store_full_path=True)
        paths = [test_files_path / "msg" / "sample.msg"]
        result = converter.run(sources=paths, meta={"date_added": "2021-09-01T00:00:00"})
        assert len(result["documents"]) == 1
        assert result["documents"][0].content.startswith('From: "Sebastian Lee"')
        assert result["documents"][0].meta == {
            "date_added": "2021-09-01T00:00:00",
            "file_path": str(test_files_path / "msg" / "sample.msg"),
        }
        assert len(result["attachments"]) == 1
        assert result["attachments"][0].mime_type == "application/pdf"
        assert result["attachments"][0].meta == {
            "date_added": "2021-09-01T00:00:00",
            "parent_file_path": str(test_files_path / "msg" / "sample.msg"),
            "file_path": "sample_pdf_1.pdf",
        }

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """A non-MSG source is skipped with a logged warning and produces neither Documents nor attachments."""
        converter = MSGToDocument(store_full_path=False)
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        result = converter.run(sources=paths, meta={"date_added": "2021-09-01T00:00:00"})
        assert len(result["documents"]) == 0
        # A failed conversion must not leave any attachments behind either.
        assert len(result["attachments"]) == 0
        assert "msg_file is not an Outlook MSG file" in caplog.text

    def test_run_empty_sources(self):
        """An empty source list returns empty `documents` and `attachments` lists."""
        converter = MSGToDocument(store_full_path=False)
        result = converter.run(sources=[])
        assert len(result["documents"]) == 0
        assert len(result["attachments"]) == 0
|
||||
BIN
test/test_files/msg/sample.msg
Normal file
BIN
test/test_files/msg/sample.msg
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user