@component
class MSGToDocument:
    """
    Converts Microsoft Outlook .msg files into Haystack Documents.

    This component extracts email metadata (such as sender, recipients, CC, BCC, subject) and body content from .msg
    files and converts them into structured Haystack Documents. Additionally, any file attachments within the .msg
    file are extracted as ByteStream objects.

    ### Example Usage

    ```python
    from haystack.components.converters.msg import MSGToDocument
    from datetime import datetime

    converter = MSGToDocument()
    results = converter.run(sources=["sample.msg"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    attachments = results["attachments"]
    print(documents[0].content)
    ```
    """

    def __init__(self, store_full_path: bool = False) -> None:
        """
        Creates a MSGToDocument component.

        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        oxmsg_import.check()
        self.store_full_path = store_full_path

    @staticmethod
    def _is_encrypted(msg: "Message") -> bool:
        """
        Determines whether the provided MSG file is encrypted.

        The check looks for the substring "encrypted" in the message's `Content-Type` header.

        :param msg: The MSG file as a parsed Message object.
        :returns: True if the MSG file is encrypted, otherwise False.
        """
        return "encrypted" in msg.message_headers.get("Content-Type", "")

    @staticmethod
    def _create_recipient_str(recip: "recipient.Recipient") -> str:
        """
        Formats a recipient's name and email into a single string.

        Empty or missing parts are skipped, so a recipient with only a name (or only an address) does not
        produce leading/trailing whitespace.

        :param recip: A recipient object extracted from the MSG file.
        :returns: The name and email address separated by a single space, or an empty string if both are absent.
        """
        parts = [part for part in (recip.name, recip.email_address) if part]
        return " ".join(parts)

    def _convert(self, file_content: io.BytesIO) -> Tuple[str, List[ByteStream]]:
        """
        Converts the MSG file content into text and extracts any attachments.

        The text consists of the available header lines (`From`, `To`, `Cc`, `Bcc`, `Subject`), followed by a blank
        line and the message body when one is present.

        :param file_content: The MSG file content as a binary stream.
        :returns: A tuple containing the extracted email text and a list of ByteStream objects for attachments.
        :raises ValueError: If the MSG file is encrypted and cannot be read.
        """
        msg = Message.load(file_content)
        if self._is_encrypted(msg):
            raise ValueError("The MSG file is encrypted and cannot be read.")

        txt = ""

        # Sender
        if msg.sender is not None:
            txt += f"From: {msg.sender}\n"

        # To: drop recipients that have neither a name nor an address so the list has no stray commas.
        formatted_recipients = (self._create_recipient_str(r) for r in msg.recipients)
        recipients_str = ",".join(r for r in formatted_recipients if r)
        if recipients_str:
            txt += f"To: {recipients_str}\n"

        # CC (header name casing varies between messages)
        cc_header = msg.message_headers.get("Cc") or msg.message_headers.get("CC")
        if cc_header is not None:
            txt += f"Cc: {cc_header}\n"

        # BCC
        bcc_header = msg.message_headers.get("Bcc") or msg.message_headers.get("BCC")
        if bcc_header is not None:
            txt += f"Bcc: {bcc_header}\n"

        # Subject: truthiness skips both an empty and a missing subject.
        if msg.subject:
            txt += f"Subject: {msg.subject}\n"

        # Body
        if msg.body is not None:
            txt += "\n" + msg.body

        # Attachments without payload bytes are skipped.
        attachments = [
            ByteStream(
                data=attachment.file_bytes, meta={"file_path": attachment.file_name}, mime_type=attachment.mime_type
            )
            for attachment in msg.attachments
            if attachment.file_bytes is not None
        ]

        return txt, attachments

    @component.output_types(documents=List[Document], attachments=List[ByteStream])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, Union[List[Document], List[ByteStream]]]:
        """
        Converts MSG files to Documents.

        Sources that cannot be read or converted are logged with a warning and skipped.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents.
            - `attachments`: Created ByteStream objects from file attachments. Each attachment carries the
              metadata of its parent Document, its own `file_path`, and - when the parent has a `file_path` -
              a `parent_file_path` entry.
        """
        if not sources:
            return {"documents": [], "attachments": []}

        documents: List[Document] = []
        all_attachments: List[ByteStream] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                text, attachments = self._convert(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                merged_metadata["file_path"] = os.path.basename(bytestream.meta["file_path"])

            documents.append(Document(content=text, meta=merged_metadata))

            # A ByteStream source may carry no `file_path` at all; in that case there is no parent path to
            # record, and indexing the key directly would raise KeyError and abort the whole run.
            parent_file_path = merged_metadata.get("file_path")
            for attachment in attachments:
                attachment_meta = dict(merged_metadata)
                if parent_file_path is not None:
                    attachment_meta["parent_file_path"] = parent_file_path
                attachment_meta["file_path"] = attachment.meta["file_path"]
                all_attachments.append(
                    ByteStream(data=attachment.data, meta=attachment_meta, mime_type=attachment.mime_type)
                )

        return {"documents": documents, "attachments": all_attachments}
class TestMSGToDocument:
    """Tests for the MSGToDocument converter component."""

    def test_run(self, test_files_path):
        """A sample .msg file yields one Document and one PDF attachment carrying the merged metadata."""
        msg_path = test_files_path / "msg" / "sample.msg"

        output = MSGToDocument(store_full_path=True).run(
            sources=[msg_path], meta={"date_added": "2021-09-01T00:00:00"}
        )
        docs = output["documents"]
        files = output["attachments"]

        assert len(docs) == 1
        document = docs[0]
        assert document.content.startswith('From: "Sebastian Lee"')
        assert document.meta == {
            "date_added": "2021-09-01T00:00:00",
            "file_path": str(msg_path),
        }

        assert len(files) == 1
        attached = files[0]
        assert attached.mime_type == "application/pdf"
        assert attached.meta == {
            "date_added": "2021-09-01T00:00:00",
            "parent_file_path": str(msg_path),
            "file_path": "sample_pdf_1.pdf",
        }

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """A non-MSG input produces no Documents and the parser's error text appears in the log."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        component_under_test = MSGToDocument(store_full_path=False)

        output = component_under_test.run(sources=[pdf_path], meta={"date_added": "2021-09-01T00:00:00"})

        assert len(output["documents"]) == 0
        assert "msg_file is not an Outlook MSG file" in caplog.text

    def test_run_empty_sources(self, test_files_path):
        """An empty source list returns empty `documents` and `attachments` lists."""
        output = MSGToDocument(store_full_path=False).run(sources=[])

        assert len(output["documents"]) == 0
        assert len(output["attachments"]) == 0