haystack/test/components/converters/test_csv_to_document.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch
import pandas as pd
from pathlib import Path

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters.csv import CSVToDocument


@pytest.fixture
def csv_converter():
    """Provide a CSVToDocument instance built with its default arguments."""
    converter = CSVToDocument()
    return converter


class TestCSVToDocument:
    """Tests for the CSVToDocument converter component."""

    def test_init(self, csv_converter):
        """
        Test that the fixture yields a CSVToDocument instance.
        """
        assert isinstance(csv_converter, CSVToDocument)

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        csv_dir = test_files_path / "csv"
        sample_1 = csv_dir / "sample_1.csv"

        # The first source is a ByteStream carrying its own metadata; the other two are plain paths.
        bytestream = ByteStream.from_file_path(sample_1)
        bytestream.meta["file_path"] = str(sample_1)
        bytestream.meta["key"] = "value"
        files = [bytestream, csv_dir / "sample_2.csv", csv_dir / "sample_3.csv"]

        converter = CSVToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]

        assert len(docs) == 3
        assert docs[0].content == "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n"
        assert isinstance(docs[0].content, str)
        # Metadata set on the ByteStream must be carried over to the document unchanged.
        assert docs[0].meta == bytestream.meta
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.

        A source that cannot be read is expected to be logged and skipped while the
        remaining sources are still converted, in their original order.
        """
        paths = [
            test_files_path / "csv" / "sample_2.csv",
            "non_existing_file.csv",
            test_files_path / "csv" / "sample_3.csv",
        ]
        converter = CSVToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
        assert "non_existing_file.csv" in caplog.text

        docs = output["documents"]
        assert len(docs) == 2
        # Both readable sources must survive, and the unreadable one must not shift their order.
        assert docs[0].meta["file_path"] == str(paths[0])
        assert docs[1].meta["file_path"] == str(paths[2])

    def test_encoding_override(self, test_files_path, caplog):
        """
        Test that the `encoding` init parameter controls how the source bytes are decoded.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"

        # Decoding the sample with a mismatching codec is expected to log a decode error.
        converter = CSVToDocument(encoding="utf-16-le")
        # NOTE(review): this run happens before the log level is changed below. Records emitted
        # here may also end up in caplog.text, so the assertion could be satisfied by either
        # run - confirm the level the converter logs at before removing this call.
        converter.run(sources=[bytestream])
        with caplog.at_level(logging.ERROR):
            converter.run(sources=[bytestream])
            assert "codec can't decode" in caplog.text

        # With the matching codec the content is decoded as expected.
        converter = CSVToDocument(encoding="utf-8")
        output = converter.run(sources=[bytestream])
        assert "Name,Age\r\n" in output["documents"][0].content

    def test_run_with_meta(self):
        """
        Test that metadata passed to `run` is merged with the ByteStream's own metadata.
        """
        bytestream = ByteStream(
            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
            meta={"name": "test_name", "language": "en"},
        )
        converter = CSVToDocument()
        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter;
        # on a key conflict ("language") the value from the meta parameter is expected to win
        assert document.meta == {"name": "test_name", "language": "it"}
Convertor: CSVToDocument (#8328) * carry forwarded initial commit * fix: doc strings * fix: update docstrings * fix: docstring update * fix: csv encoding in actions * fix: line endings through hooks * fix: converter docs addition 2024-09-06 14:29:12 +05:30			`# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>`
			`#`
			`# SPDX-License-Identifier: Apache-2.0`
			`import logging`
			`from unittest.mock import patch`
			`import pandas as pd`
			`from pathlib import Path`

			`import pytest`

			`from haystack.dataclasses import ByteStream`
			`from haystack.components.converters.csv import CSVToDocument`


			`@pytest.fixture`
			`def csv_converter():`
			`return CSVToDocument()`


			`class TestCSVToDocument:`
			`def test_init(self, csv_converter):`
			`assert isinstance(csv_converter, CSVToDocument)`

			`def test_run(self, test_files_path):`
			`"""`
			`Test if the component runs correctly.`
			`"""`
			`bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")`
			`bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")`
			`bytestream.meta["key"] = "value"`
			`files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]`
			`converter = CSVToDocument()`
			`output = converter.run(sources=files)`
			`docs = output["documents"]`
			`assert len(docs) == 3`
			`assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content`
			`assert isinstance(docs[0].content, str)`
			`assert docs[0].meta == bytestream.meta`
			`assert docs[1].meta["file_path"] == str(files[1])`
			`assert docs[2].meta["file_path"] == str(files[2])`

			`def test_run_error_handling(self, test_files_path, caplog):`
			`"""`
			`Test if the component correctly handles errors.`
			`"""`
			`paths = [`
			`test_files_path / "csv" / "sample_2.csv",`
			`"non_existing_file.csv",`
			`test_files_path / "csv" / "sample_3.csv",`
			`]`
			`converter = CSVToDocument()`
			`with caplog.at_level(logging.WARNING):`
			`output = converter.run(sources=paths)`
			`assert "non_existing_file.csv" in caplog.text`
			`docs = output["documents"]`
			`assert len(docs) == 2`
			`assert docs[0].meta["file_path"] == str(paths[0])`

			`def test_encoding_override(self, test_files_path, caplog):`
			`"""`
			`Test if the encoding metadata field is used properly`
			`"""`
			`bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")`
			`bytestream.meta["key"] = "value"`

			`converter = CSVToDocument(encoding="utf-16-le")`
			`output = converter.run(sources=[bytestream])`
			`with caplog.at_level(logging.ERROR):`
			`output = converter.run(sources=[bytestream])`
			`assert "codec can't decode" in caplog.text`

			`converter = CSVToDocument(encoding="utf-8")`
			`output = converter.run(sources=[bytestream])`
			`assert "Name,Age\r\n" in output["documents"][0].content`

			`def test_run_with_meta(self):`
			`bytestream = ByteStream(`
			`data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",`
			`meta={"name": "test_name", "language": "en"},`
			`)`
			`converter = CSVToDocument()`
			`output = converter.run(sources=[bytestream], meta=[{"language": "it"}])`
			`document = output["documents"][0]`

			`# check that the metadata from the bytestream is merged with that from the meta parameter`
			`assert document.meta == {"name": "test_name", "language": "it"}`