haystack/test/preview/components/file_converters/test_textfile_to_document.py
import logging
from pathlib import Path
from unittest.mock import patch

import pytest
from canals.errors import PipelineRuntimeError
from langdetect import LangDetectException

from haystack.preview.components.file_converters.txt import TextFileToDocument
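
# A minimal usage sketch of the component under test (comments only; it assumes nothing beyond
# the API exercised by the tests below, i.e. run(paths=...) returning a dict with a "documents"
# key):
#
#     converter = TextFileToDocument(encoding="utf-8", valid_languages=["en"])
#     result = converter.run(paths=["some_file.txt"])
#     docs = result["documents"]
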
class TestTextfileToDocument:
@pytest.mark.unit
def test_to_dict(self):
component = TextFileToDocument()
data = component.to_dict()
assert data == {
"type": "TextFileToDocument",
"init_parameters": {
"encoding": "utf-8",
"remove_numeric_tables": False,
"numeric_row_threshold": 0.4,
"valid_languages": [],
"id_hash_keys": [],
"progress_bar": True,
},
}
@pytest.mark.unit
def test_to_dict_with_custom_init_parameters(self):
component = TextFileToDocument(
encoding="latin-1",
remove_numeric_tables=True,
numeric_row_threshold=0.7,
valid_languages=["en", "de"],
id_hash_keys=["name"],
progress_bar=False,
)
data = component.to_dict()
assert data == {
"type": "TextFileToDocument",
"init_parameters": {
"encoding": "latin-1",
"remove_numeric_tables": True,
"numeric_row_threshold": 0.7,
"valid_languages": ["en", "de"],
"id_hash_keys": ["name"],
"progress_bar": False,
},
}
@pytest.mark.unit
def test_from_dict(self):
data = {
"type": "TextFileToDocument",
"init_parameters": {
"encoding": "latin-1",
"remove_numeric_tables": True,
"numeric_row_threshold": 0.7,
"valid_languages": ["en", "de"],
"id_hash_keys": ["name"],
"progress_bar": False,
},
}
component = TextFileToDocument.from_dict(data)
assert component.encoding == "latin-1"
assert component.remove_numeric_tables
assert component.numeric_row_threshold == 0.7
assert component.valid_languages == ["en", "de"]
assert component.id_hash_keys == ["name"]
assert not component.progress_bar
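
    # A round-trip sketch extending the two serialization tests above (an illustrative addition,
    # not part of the original suite): from_dict(to_dict()) should yield an equivalently
    # configured component, given the behaviour asserted in test_to_dict and test_from_dict.
    @pytest.mark.unit
    def test_to_dict_from_dict_roundtrip(self):
        component = TextFileToDocument(encoding="latin-1", valid_languages=["en", "de"])
        restored = TextFileToDocument.from_dict(component.to_dict())
        assert restored.encoding == "latin-1"
        assert restored.valid_languages == ["en", "de"]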
@pytest.mark.unit
def test_run(self, preview_samples_path):
"""
Test if the component runs correctly.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
converter = TextFileToDocument()
output = converter.run(paths=paths)
docs = output["documents"]
assert len(docs) == 2
assert docs[0].content == "Some text for testing.\nTwo lines in here."
assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
assert docs[0].metadata["file_path"] == str(paths[0])
assert docs[1].metadata["file_path"] == str(paths[1])
@pytest.mark.unit
def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
with caplog.at_level(logging.WARNING):
output = converter.run(paths=[file_path], valid_languages=["de"])
assert (
f"Text from file {file_path} is not in one of the valid languages: ['de']. "
f"The file may have been decoded incorrectly." in caplog.text
)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content == "Some text for testing.\nTwo lines in here."
@pytest.mark.unit
def test_run_error_handling(self, preview_samples_path, caplog):
"""
Test if the component correctly handles errors.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
converter = TextFileToDocument()
with caplog.at_level(logging.WARNING):
output = converter.run(paths=paths)
assert (
"Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
in caplog.text
)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].metadata["file_path"] == str(paths[0])
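
    # The test above pins the error-handling contract: an unreadable path is skipped with a
    # warning (presumably the PipelineRuntimeError raised by _read_and_clean_file, exercised in
    # test_read_and_clean_file_non_existing_file below, caught and logged by run) while the
    # remaining files are still converted.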
@pytest.mark.unit
def test_prepare_metadata_no_metadata(self):
"""
Test if the metadata is correctly prepared when no custom metadata is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=None, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
@pytest.mark.unit
def test_prepare_metadata_single_dict(self):
"""
Test if the metadata is correctly prepared when a single dict is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata={"name": "test"}, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test"
assert meta[1]["name"] == "test"
@pytest.mark.unit
def test_prepare_metadata_list_of_dicts(self):
"""
Test if the metadata is correctly prepared when a list of dicts is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test1"
assert meta[1]["name"] == "test2"
@pytest.mark.unit
def test_prepare_metadata_unmatching_list_len(self):
"""
Test if an error is raised when the number of metadata dicts is not equal to the number of
file paths.
"""
converter = TextFileToDocument()
with pytest.raises(
PipelineRuntimeError,
match="The number of metadata entries must match the number of paths if metadata is a list.",
):
converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
)
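
    # Taken together, the four _prepare_metadata tests above cover the three accepted metadata
    # forms: None (only "file_path" is added), a single dict (broadcast to every file), and a
    # list of dicts (matched to the paths by position and required to have the same length).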
@pytest.mark.unit
def test_read_and_clean_file(self, preview_samples_path):
"""
Test if the file is correctly read.
"""
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
assert text == "Some text for testing.\nTwo lines in here."
@pytest.mark.unit
def test_read_and_clean_file_non_existing_file(self):
"""
Test if an error is raised when the file does not exist.
"""
converter = TextFileToDocument()
file_path = "non_existing_file.txt"
with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
@pytest.mark.unit
def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
"""
Test if the file is correctly read and numeric tables are removed.
"""
file_path = preview_samples_path / "txt" / "doc_2.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
assert text == "This is a test line.\n987 654 321."
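
    # Note on the expected text above: the row "123 456 789" of doc_2.txt is dropped as a
    # numeric table row, while "987 654 321." survives, presumably because its trailing period
    # makes _is_numeric_row treat it as prose (see test_is_numeric_row_only_numbers_with_period
    # below).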
@pytest.mark.unit
def test_clean_page_without_remove_numeric_tables(self):
"""
Test if the page is not changed when remove_numeric_tables is False.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
assert cleaned_page == page
@pytest.mark.unit
def test_clean_page_with_remove_numeric_tables(self):
"""
Test if the page is correctly cleaned when remove_numeric_tables is True.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
assert cleaned_page == "This is a test line."
@pytest.mark.unit
def test_is_numeric_row_only_numbers(self):
"""
Test if the line is correctly identified as a numeric row when it only contains numbers.
"""
converter = TextFileToDocument()
line = "123 456 789"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_text(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains text.
"""
converter = TextFileToDocument()
line = "This is a test line."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_numbers_with_period(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
the end.
"""
converter = TextFileToDocument()
line = "123 456 789."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_more_numbers_than_text(self):
"""
        Test if the line is correctly identified as a numeric row when more than 40% of its words are numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_less_numbers_than_text(self):
"""
        Test if the line is correctly identified as a non-numeric row when less than 40% of its words are numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test line"
assert not converter._is_numeric_row(line=line)
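
    # The two boundary tests above pin the ratio heuristic: "123 456 789 This is a test" has
    # 3 numeric words out of 7 (~43%) and counts as numeric, while adding "line" gives 3 out
    # of 8 (37.5%) and does not, which presumably corresponds to the default
    # numeric_row_threshold of 0.4 asserted in test_to_dict above.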
@pytest.mark.unit
def test_is_numeric_row_words_consist_of_numbers_and_text(self):
"""
        Test if the line is correctly identified as a numeric row when its words mix digits and letters.
"""
converter = TextFileToDocument()
line = "123eur 456usd"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_validate_language(self):
"""
Test if the language is correctly validated.
"""
converter = TextFileToDocument()
with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])
@pytest.mark.unit
def test_validate_language_no_languages_specified(self):
"""
        Test if _validate_language returns True when no valid languages are specified.
"""
converter = TextFileToDocument()
assert converter._validate_language(text="This is an english test.", valid_languages=[])
@pytest.mark.unit
def test_validate_language_lang_detect_exception(self):
"""
        Test if _validate_language returns False when langdetect raises an exception.
"""
converter = TextFileToDocument()
with patch(
"haystack.preview.components.file_converters.txt.langdetect.detect",
side_effect=LangDetectException(code=0, message="Test"),
):
assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
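
    # langdetect.detect is patched in the language tests above so they stay deterministic and
    # offline; the LangDetectException branch mirrors how langdetect fails on text it cannot
    # extract features from (e.g. empty or very short input), in which case _validate_language
    # falls back to treating the text as not matching any valid language.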