# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import logging
import os

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters.txt import TextFileToDocument


class TestTextfileToDocument:
    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["key"] = "value"
        files = [str(test_files_path / "txt" / "doc_1.txt"), test_files_path / "txt" / "doc_2.txt", bytestream]
        converter = TextFileToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Some text for testing." in docs[0].content
        assert "This is a test line." in docs[1].content
        assert "That's yet another file!" in docs[2].content
        assert docs[0].meta["file_path"] == os.path.basename(files[0])
        assert docs[1].meta["file_path"] == os.path.basename(files[1])
        assert docs[2].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"}

    def test_run_with_store_full_path(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["key"] = "value"
        files = [str(test_files_path / "txt" / "doc_1.txt"), bytestream]
        converter = TextFileToDocument(store_full_path=False)
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 2
        assert "Some text for testing." in docs[0].content
        assert "That's yet another file!" in docs[1].content
        assert docs[0].meta["file_path"] == "doc_1.txt"
        assert docs[1].meta["file_path"] == "doc_3.txt"

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [test_files_path / "txt" / "doc_1.txt", "non_existing_file.txt", test_files_path / "txt" / "doc_3.txt"]
        converter = TextFileToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.txt" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == os.path.basename(paths[0])
        assert docs[1].meta["file_path"] == os.path.basename(paths[2])

    def test_encoding_override(self, test_files_path):
        """
        Test if the encoding metadata field is used properly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_1.txt")
        bytestream.meta["key"] = "value"

        converter = TextFileToDocument(encoding="utf-16")
        output = converter.run(sources=[bytestream])
        assert "Some text for testing." not in output["documents"][0].content
        bytestream.meta["encoding"] = "utf-8"
        output = converter.run(sources=[bytestream])
        assert "Some text for testing." in output["documents"][0].content

    def test_run_with_meta(self):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        converter = TextFileToDocument()

        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"author": "test_author", "language": "it"}