feat: Add MSGToDocument converter (#8868)

* Initial commit of MSG converter from Bijay * Updates to the MSG converter * Add license header * Add tests for msg converter * Update converter * Expanding tests * Update docstrings * add license header * Add reno * Add to inits and pydocs * Add test for empty input * Fix types * Fix mypy --------- Co-authored-by: Bijay Gurung <bijay.learning@gmail.com>
2025-11-14 17:13:03 +00:00 · 2025-02-24 08:12:32 +01:00 · 2025-02-24 08:12:32 +01:00 · 99a998f90b
commit 99a998f90b
parent d7dfc5222c
7 changed files with 242 additions and 0 deletions
--- a/docs/pydoc/config/converters_api.yml
+++ b/docs/pydoc/config/converters_api.yml
@ -9,6 +9,7 @@ loaders:
        "html",
        "json",
        "markdown",
        "msg",
        "openapi_functions",
        "output_adapter",
        "pdfminer",
--- a/haystack/components/converters/__init__.py
+++ b/haystack/components/converters/__init__.py
@ -14,6 +14,7 @@ _import_structure = {
    "html": ["HTMLToDocument"],
    "json": ["JSONConverter"],
    "markdown": ["MarkdownToDocument"],
    "msg": ["MSGToDocument"],
    "openapi_functions": ["OpenAPIServiceToFunctions"],
    "output_adapter": ["OutputAdapter"],
    "pdfminer": ["PDFMinerToDocument"],
@ -31,6 +32,7 @@ if TYPE_CHECKING:
    from .html import HTMLToDocument
    from .json import JSONConverter
    from .markdown import MarkdownToDocument
    from .msg import MSGToDocument
    from .openapi_functions import OpenAPIServiceToFunctions
    from .output_adapter import OutputAdapter
    from .pdfminer import PDFMinerToDocument
--- a/haystack/components/converters/msg.py
+++ b/haystack/components/converters/msg.py
@ -0,0 +1,194 @@
 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
 import io
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from haystack import Document, component, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
 with LazyImport("Run 'pip install python-oxmsg'") as oxmsg_import:
    from oxmsg import Message, recipient
 logger = logging.getLogger(__name__)
@component
class MSGToDocument:
    """
    Converts Microsoft Outlook .msg files into Haystack Documents.

    This component extracts email metadata (such as sender, recipients, CC, BCC, subject) and body content from .msg
    files and converts them into structured Haystack Documents. Additionally, any file attachments within the .msg
    file are extracted as ByteStream objects.

    ### Example Usage

    ```python
    from haystack.components.converters.msg import MSGToDocument
    from datetime import datetime

    converter = MSGToDocument()
    results = converter.run(sources=["sample.msg"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    attachments = results["attachments"]
    print(documents[0].content)
    ```
    """

    def __init__(self, store_full_path: bool = False) -> None:
        """
        Creates a MSGToDocument component.

        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Fail early with an installation hint if the optional `python-oxmsg` dependency is missing.
        oxmsg_import.check()
        self.store_full_path = store_full_path

    @staticmethod
    def _is_encrypted(msg: "Message") -> bool:
        """
        Determines whether the provided MSG file is encrypted.

        The check looks for the substring "encrypted" in the message's `Content-Type` header; a missing header is
        treated as not encrypted.

        :param msg: The MSG file as a parsed Message object.
        :returns: True if the MSG file is encrypted, otherwise False.
        """
        return "encrypted" in msg.message_headers.get("Content-Type", "")

    @staticmethod
    def _create_recipient_str(recip: "recipient.Recipient") -> str:
        """
        Formats a recipient's name and email into a single string.

        Only the non-empty parts are used, separated by a single space, so a recipient that has a name but no
        email address does not end with a dangling space.

        :param recip: A recipient object extracted from the MSG file.
        :returns: A formatted string combining the recipient's name and email address. Empty if both are missing.
        """
        return " ".join(part for part in (recip.name, recip.email_address) if part)

    def _convert(self, file_content: io.BytesIO) -> Tuple[str, List[ByteStream]]:
        """
        Converts the MSG file content into text and extracts any attachments.

        The text consists of the available header lines (`From`, `To`, `Cc`, `Bcc`, `Subject`), each on its own
        line, followed by an empty line and the message body when a body is present.

        :param file_content: The MSG file content as a binary stream.
        :returns: A tuple containing the extracted email text and a list of ByteStream objects for attachments.
        :raises ValueError: If the MSG file is encrypted and cannot be read.
        """
        msg = Message.load(file_content)
        if self._is_encrypted(msg):
            raise ValueError("The MSG file is encrypted and cannot be read.")

        headers = msg.message_headers
        parts: List[str] = []

        # Sender
        if msg.sender is not None:
            parts.append(f"From: {msg.sender}\n")

        # To: recipients without a name and an email address are skipped so the list has no empty entries.
        recipient_strs = (self._create_recipient_str(r) for r in msg.recipients)
        recipients_str = ",".join(r for r in recipient_strs if r)
        if recipients_str:
            parts.append(f"To: {recipients_str}\n")

        # CC: both spellings of the header key are checked; an empty header is not written.
        cc_header = headers.get("Cc") or headers.get("CC")
        if cc_header:
            parts.append(f"Cc: {cc_header}\n")

        # BCC: same handling as CC.
        bcc_header = headers.get("Bcc") or headers.get("BCC")
        if bcc_header:
            parts.append(f"Bcc: {bcc_header}\n")

        # Subject
        if msg.subject:
            parts.append(f"Subject: {msg.subject}\n")

        # Body, separated from the header lines by an empty line.
        if msg.body is not None:
            parts.append("\n" + msg.body)

        txt = "".join(parts)

        # Attachments without payload bytes are skipped.
        attachments = [
            ByteStream(
                data=attachment.file_bytes, meta={"file_path": attachment.file_name}, mime_type=attachment.mime_type
            )
            for attachment in msg.attachments
            if attachment.file_bytes is not None
        ]

        return txt, attachments

    @component.output_types(documents=List[Document], attachments=List[ByteStream])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, Union[List[Document], List[ByteStream]]]:
        """
        Converts MSG files to Documents.

        Sources that cannot be read or converted are skipped with a warning instead of aborting the whole run.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents.
            - `attachments`: Created ByteStream objects from file attachments. Each attachment carries the metadata
              of its parent Document, its own `file_path`, and, when the parent has a `file_path`, a
              `parent_file_path` entry.
        """
        if not sources:
            return {"documents": [], "attachments": []}

        documents: List[Document] = []
        all_attachments: List[ByteStream] = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            try:
                text, attachments = self._convert(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

            # User-supplied metadata takes precedence over the metadata carried by the source.
            merged_metadata = {**bytestream.meta, **metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                merged_metadata["file_path"] = os.path.basename(bytestream.meta["file_path"])

            documents.append(Document(content=text, meta=merged_metadata))

            for attachment in attachments:
                attachment_meta = dict(merged_metadata)
                # A ByteStream source may carry no "file_path"; only record the parent path when there is one
                # instead of failing with a KeyError.
                if "file_path" in merged_metadata:
                    attachment_meta["parent_file_path"] = merged_metadata["file_path"]
                attachment_meta["file_path"] = attachment.meta["file_path"]

                all_attachments.append(
                    ByteStream(data=attachment.data, meta=attachment_meta, mime_type=attachment.mime_type)
                )

        return {"documents": documents, "attachments": all_attachments}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -113,6 +113,7 @@ extra-dependencies = [
  "jq",                               # JSONConverter
  "openpyxl",                         # XLSXToDocument
  "tabulate",                         # XLSXToDocument
  "python-oxmsg",                     # MSGToDocument
  "nltk>=3.9.1", # NLTKDocumentSplitter
--- a/releasenotes/notes/add-msg-to-document-converter-79338eef22a3fd82.yaml
+++ b/releasenotes/notes/add-msg-to-document-converter-79338eef22a3fd82.yaml
@ -0,0 +1,6 @@
 ---
 features:
  - |
    Add a new MSGToDocument component to convert .msg files into Haystack Document objects.
    - Extracts email metadata (e.g. sender, recipients, CC, BCC, subject) and body content into a Document.
    - Converts attachments into ByteStream objects which can be passed onto a FileTypeRouter + relevant converters.
--- a/test/components/converters/test_msg_to_document.py
+++ b/test/components/converters/test_msg_to_document.py
@ -0,0 +1,38 @@
 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
 from haystack.components.converters.msg import MSGToDocument
 class TestMSGToDocument:
    """Tests for the MSGToDocument converter component."""

    def test_run(self, test_files_path):
        """Converting the sample .msg file yields one Document and one attachment with the merged metadata."""
        msg_path = test_files_path / "msg" / "sample.msg"
        component_under_test = MSGToDocument(store_full_path=True)

        output = component_under_test.run(sources=[msg_path], meta={"date_added": "2021-09-01T00:00:00"})
        docs = output["documents"]
        extracted = output["attachments"]

        # The email itself becomes exactly one Document that keeps the full source path.
        assert len(docs) == 1
        assert docs[0].content.startswith('From: "Sebastian Lee"')
        expected_doc_meta = {"date_added": "2021-09-01T00:00:00", "file_path": str(msg_path)}
        assert docs[0].meta == expected_doc_meta

        # The attachment inherits the Document metadata and points back to its parent file.
        assert len(extracted) == 1
        assert extracted[0].mime_type == "application/pdf"
        expected_attachment_meta = {
            "date_added": "2021-09-01T00:00:00",
            "parent_file_path": str(msg_path),
            "file_path": "sample_pdf_1.pdf",
        }
        assert extracted[0].meta == expected_attachment_meta

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """A source that is not an MSG file is skipped and the failure shows up in the log output."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        component_under_test = MSGToDocument(store_full_path=False)

        output = component_under_test.run(sources=[pdf_path], meta={"date_added": "2021-09-01T00:00:00"})

        assert len(output["documents"]) == 0
        assert "msg_file is not an Outlook MSG file" in caplog.text

    def test_run_empty_sources(self, test_files_path):
        """An empty list of sources produces no Documents and no attachments."""
        output = MSGToDocument(store_full_path=False).run(sources=[])

        assert len(output["documents"]) == 0
        assert len(output["attachments"]) == 0
--- a/test/test_files/msg/sample.msg
+++ b/test/test_files/msg/sample.msg