# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import logging
import os

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters.txt import TextFileToDocument


class TestTextfileToDocument:
    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["key"] = "value"
        files = [str(test_files_path / "txt" / "doc_1.txt"), test_files_path / "txt" / "doc_2.txt", bytestream]
        converter = TextFileToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Some text for testing." in docs[0].content
        assert "This is a test line." in docs[1].content
        assert "That's yet another file!" in docs[2].content
        assert docs[0].meta["file_path"] == os.path.basename(files[0])
        assert docs[1].meta["file_path"] == os.path.basename(files[1])
        assert docs[2].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"}

    def test_run_with_store_full_path(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
        bytestream.meta["key"] = "value"
        files = [str(test_files_path / "txt" / "doc_1.txt"), bytestream]
        converter = TextFileToDocument(store_full_path=False)
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 2
        assert "Some text for testing." in docs[0].content
        assert "That's yet another file!" in docs[1].content
        assert docs[0].meta["file_path"] == "doc_1.txt"
        assert docs[1].meta["file_path"] == "doc_3.txt"

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [test_files_path / "txt" / "doc_1.txt", "non_existing_file.txt", test_files_path / "txt" / "doc_3.txt"]
        converter = TextFileToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.txt" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == os.path.basename(paths[0])
        assert docs[1].meta["file_path"] == os.path.basename(paths[2])

    def test_encoding_override(self, test_files_path):
        """
        Test if the encoding metadata field is used properly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_1.txt")
        bytestream.meta["key"] = "value"

        converter = TextFileToDocument(encoding="utf-16")
        output = converter.run(sources=[bytestream])
        assert "Some text for testing." not in output["documents"][0].content
        bytestream.meta["encoding"] = "utf-8"
        output = converter.run(sources=[bytestream])
        assert "Some text for testing." in output["documents"][0].content

    def test_run_with_meta(self):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        converter = TextFileToDocument()

        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"author": "test_author", "language": "it"}