2023-10-31 15:35:05 +01:00
|
|
|
import logging
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from haystack.preview import Document
|
2023-11-06 12:00:01 +01:00
|
|
|
from haystack.preview.components.classifiers import DocumentLanguageClassifier
|
2023-10-31 15:35:05 +01:00
|
|
|
|
|
|
|
|
|
|
|
class TestDocumentLanguageClassifier:
    """Unit tests for ``DocumentLanguageClassifier``.

    The tests cover the default configuration, input validation, language
    detection on a single document, routing of documents through ``run``,
    and the warning emitted when no language can be detected.
    """

    @pytest.mark.unit
    def test_init(self):
        """A classifier built without arguments has ``languages == ["en"]``."""
        component = DocumentLanguageClassifier()
        assert component.languages == ["en"]

    @pytest.mark.unit
    def test_non_document_input(self):
        """Passing a plain string as ``documents`` raises ``TypeError``."""
        # Build the component outside the ``raises`` block so that only the
        # ``run`` call is allowed to satisfy the expected exception.
        classifier = DocumentLanguageClassifier()
        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
            classifier.run(documents="This is an english sentence.")

    @pytest.mark.unit
    def test_single_document(self):
        """Passing a bare ``Document`` (not wrapped in a list) raises ``TypeError``."""
        # Same reasoning as above: keep the ``raises`` body down to the call
        # under test.
        classifier = DocumentLanguageClassifier()
        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
            classifier.run(documents=Document(content="This is an english sentence."))

    @pytest.mark.unit
    def test_empty_list(self):
        """Running on an empty list returns an empty ``documents`` list."""
        classifier = DocumentLanguageClassifier()
        result = classifier.run(documents=[])
        assert result == {"documents": []}

    @pytest.mark.unit
    def test_detect_language(self):
        """``detect_language`` returns ``"en"`` for an English sentence."""
        classifier = DocumentLanguageClassifier()
        detected_language = classifier.detect_language(Document(content="This is an english sentence."))
        assert detected_language == "en"

    @pytest.mark.unit
    def test_classify_as_en_and_unmatched(self):
        """With default settings, English is tagged ``"en"`` and German ``"unmatched"``."""
        classifier = DocumentLanguageClassifier()
        english_document = Document(content="This is an english sentence.")
        german_document = Document(content="Ein deutscher Satz ohne Verb.")

        result = classifier.run(documents=[english_document, german_document])

        # The assertions index by position: the English document was passed
        # first and the German one second.
        assert result["documents"][0].meta["language"] == "en"
        assert result["documents"][1].meta["language"] == "unmatched"

    @pytest.mark.unit
    def test_warning_if_no_language_detected(self, caplog):
        """A document whose content is just ``"."`` triggers a warning log."""
        with caplog.at_level(logging.WARNING):
            classifier = DocumentLanguageClassifier()
            classifier.run(documents=[Document(content=".")])

        # Captured records stay available on ``caplog`` after the level
        # context exits, so the check can sit outside the ``with`` block.
        assert "Langdetect cannot detect the language of Document with id" in caplog.text
|