mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-14 09:03:34 +00:00
* Initial commit of MSG converter from Bijay * Updates to the MSG converter * Add license header * Add tests for msg converter * Update converter * Expanding tests * Update docstrings * add license header * Add reno * Add to inits and pydocs * Add test for empty input * Fix types * Fix mypy --------- Co-authored-by: Bijay Gurung <bijay.learning@gmail.com>
39 lines
1.7 KiB
Python
39 lines
1.7 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
from haystack.components.converters.msg import MSGToDocument
|
|
|
|
|
|
class TestMSGToDocument:
    """Exercise ``MSGToDocument.run`` on a sample MSG file, a non-MSG file, and no input."""

    def test_run(self, test_files_path):
        """Convert ``msg/sample.msg`` and verify the resulting document and attachment.

        The converter is built with ``store_full_path=True``, so the expected
        ``file_path`` / ``parent_file_path`` metadata is the full path string.
        """
        msg_path = test_files_path / "msg" / "sample.msg"
        run_meta = {"date_added": "2021-09-01T00:00:00"}

        msg_converter = MSGToDocument(store_full_path=True)
        output = msg_converter.run(sources=[msg_path], meta=run_meta)

        # Exactly one document, carrying the email text and the merged metadata.
        documents = output["documents"]
        assert len(documents) == 1
        email_doc = documents[0]
        assert email_doc.content.startswith('From: "Sebastian Lee"')
        expected_doc_meta = {
            "date_added": "2021-09-01T00:00:00",
            "file_path": str(msg_path),
        }
        assert email_doc.meta == expected_doc_meta

        # Exactly one attachment: a PDF that points back to the source MSG file.
        attachments = output["attachments"]
        assert len(attachments) == 1
        pdf_attachment = attachments[0]
        assert pdf_attachment.mime_type == "application/pdf"
        expected_attachment_meta = {
            "date_added": "2021-09-01T00:00:00",
            "parent_file_path": str(msg_path),
            "file_path": "sample_pdf_1.pdf",
        }
        assert pdf_attachment.meta == expected_attachment_meta

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """Feed a PDF to the converter: no documents are produced and the failure is logged."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        msg_converter = MSGToDocument(store_full_path=False)
        output = msg_converter.run(
            sources=[pdf_path],
            meta={"date_added": "2021-09-01T00:00:00"},
        )

        assert len(output["documents"]) == 0
        # The log text is expected to explain why the source was rejected.
        assert "msg_file is not an Outlook MSG file" in caplog.text

    def test_run_empty_sources(self, test_files_path):
        """Run with an empty source list: both output lists must be empty."""
        msg_converter = MSGToDocument(store_full_path=False)
        output = msg_converter.run(sources=[])

        for output_key in ("documents", "attachments"):
            assert len(output[output_key]) == 0