# haystack/test/preview/components/file_converters/test_html_to_document.py

import logging

import pytest

from haystack.preview.components.file_converters import HTMLToDocument


class TestHTMLToDocument:
    """Unit tests for the ``HTMLToDocument`` file converter component."""

    @pytest.mark.unit
    def test_to_dict(self):
        """A default-constructed component serializes with an empty ``id_hash_keys`` list."""
        serialized = HTMLToDocument().to_dict()
        assert serialized == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}}

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        """Custom ``id_hash_keys`` passed to the constructor show up in the serialized form."""
        serialized = HTMLToDocument(id_hash_keys=["name"]).to_dict()
        assert serialized == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}

    @pytest.mark.unit
    def test_from_dict(self):
        """``from_dict`` restores ``id_hash_keys`` from the serialized init parameters."""
        serialized = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        restored = HTMLToDocument.from_dict(serialized)
        assert restored.id_hash_keys == ["name"]

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Running the converter on the sample HTML file yields exactly one document
        whose text mentions "Haystack".
        """
        sample_file = preview_samples_path / "html" / "what_is_haystack.html"
        result = HTMLToDocument().run(paths=[sample_file])
        documents = result["documents"]
        assert len(documents) == 1
        assert "Haystack" in documents[0].text

    @pytest.mark.unit
    def test_run_wrong_file_type(self, preview_samples_path, caplog):
        """
        Running the converter on a non-HTML input (a WAV sample) produces no documents
        and leaves a decoding error message in the captured log.
        """
        wav_file = preview_samples_path / "audio" / "answer.wav"
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            result = converter.run(paths=[wav_file])

        # Captured records stay available on ``caplog`` after the level context exits.
        assert "codec can't decode byte" in caplog.text
        assert result["documents"] == []

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Running the converter on a path that does not exist produces no documents
        and leaves a "Could not read file" message in the captured log.
        """
        # ``preview_samples_path`` is requested but not used by this test.
        missing_file = "non_existing_file.html"
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            outcome = converter.run(paths=[missing_file])

        assert "Could not read file non_existing_file.html" in caplog.text
        assert outcome["documents"] == []
# feat: Add HTMLToDocument component (v2) (#5907) 2023-09-28 17:22:28 +02:00
			`import logging`

			`import pytest`

			`from haystack.preview.components.file_converters import HTMLToDocument`


			`class TestHTMLToDocument:`
			`@pytest.mark.unit`
			`def test_to_dict(self):`
			`component = HTMLToDocument()`
			`data = component.to_dict()`
			`assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}}`

			`@pytest.mark.unit`
			`def test_to_dict_with_custom_init_parameters(self):`
			`component = HTMLToDocument(id_hash_keys=["name"])`
			`data = component.to_dict()`
			`assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}`

			`@pytest.mark.unit`
			`def test_from_dict(self):`
			`data = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}`
			`component = HTMLToDocument.from_dict(data)`
			`assert component.id_hash_keys == ["name"]`

			`@pytest.mark.unit`
			`def test_run(self, preview_samples_path):`
			`"""`
			`Test if the component runs correctly.`
			`"""`
			`paths = [preview_samples_path / "html" / "what_is_haystack.html"]`
			`converter = HTMLToDocument()`
			`output = converter.run(paths=paths)`
			`docs = output["documents"]`
			`assert len(docs) == 1`
			`assert "Haystack" in docs[0].text`

			`@pytest.mark.unit`
			`def test_run_wrong_file_type(self, preview_samples_path, caplog):`
			`"""`
			`Test if the component runs correctly when an input file is not of the expected type.`
			`"""`
			`paths = [preview_samples_path / "audio" / "answer.wav"]`
			`converter = HTMLToDocument()`
			`with caplog.at_level(logging.WARNING):`
			`output = converter.run(paths=paths)`
			`assert "codec can't decode byte" in caplog.text`

			`docs = output["documents"]`
			`assert docs == []`

			`@pytest.mark.unit`
			`def test_run_error_handling(self, preview_samples_path, caplog):`
			`"""`
			`Test if the component correctly handles errors.`
			`"""`
			`paths = ["non_existing_file.html"]`
			`converter = HTMLToDocument()`
			`with caplog.at_level(logging.WARNING):`
			`result = converter.run(paths=paths)`
			`assert "Could not read file non_existing_file.html" in caplog.text`
			`assert result["documents"] == []`