haystack/test/preview/components/preprocessors/test_document_language_classifier.py
Julian Risch 29b1fefaa4
feat: Add DocumentLanguageClassifier 2.0 (#6037)
* add DocumentLanguageClassifier and tests

* reno

* fix import, rename DocumentCleaner

* mark example usage as python code

* add assertions to e2e test

* use deserialized document_store

* Apply suggestions from code review

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* remove from/to_dict

* use renamed InMemoryDocumentStore

* adapt to Document refactoring

* improve docstring

* fix test for new Document

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
2023-10-31 15:35:05 +01:00

52 lines
2.1 KiB
Python

import logging
import pytest
from haystack.preview import Document
from haystack.preview.components.preprocessors import DocumentLanguageClassifier
class TestDocumentLanguageClassifier:
    """Unit tests for the ``DocumentLanguageClassifier`` preview component."""

    @pytest.mark.unit
    def test_init(self):
        """A classifier built without arguments is configured for English only."""
        component = DocumentLanguageClassifier()
        assert component.languages == ["en"]

    @pytest.mark.unit
    def test_non_document_input(self):
        """Passing a plain string instead of a list of Documents raises ``TypeError``."""
        # Build the component outside the ``raises`` block so that only the
        # ``run`` call is allowed to satisfy the expected exception.
        classifier = DocumentLanguageClassifier()
        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
            classifier.run(documents="This is an english sentence.")

    @pytest.mark.unit
    def test_single_document(self):
        """Passing a bare Document (not wrapped in a list) raises ``TypeError``."""
        # Same reasoning as above: keep construction out of the guarded scope.
        classifier = DocumentLanguageClassifier()
        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
            classifier.run(documents=Document(content="This is an english sentence."))

    @pytest.mark.unit
    def test_empty_list(self):
        """An empty input list yields empty ``en`` and ``unmatched`` outputs."""
        classifier = DocumentLanguageClassifier()
        result = classifier.run(documents=[])
        assert result == {"en": [], "unmatched": []}

    @pytest.mark.unit
    def test_detect_language(self):
        """``detect_language`` reports ``"en"`` for an English sentence."""
        classifier = DocumentLanguageClassifier()
        detected_language = classifier.detect_language(Document(content="This is an english sentence."))
        assert detected_language == "en"

    @pytest.mark.unit
    def test_route_to_en_and_unmatched(self):
        """English documents land under ``en``; the German one lands under ``unmatched``."""
        classifier = DocumentLanguageClassifier()
        english_document = Document(content="This is an english sentence.")
        german_document = Document(content="Ein deutscher Satz ohne Verb.")
        result = classifier.run(documents=[english_document, german_document])
        assert result == {"en": [english_document], "unmatched": [german_document]}

    @pytest.mark.unit
    def test_warning_if_no_language_detected(self, caplog):
        """A warning is logged when the content (here a lone ``.``) has no detectable language."""
        classifier = DocumentLanguageClassifier()
        # Capture at WARNING level only around the call under test.
        with caplog.at_level(logging.WARNING):
            classifier.run(documents=[Document(content=".")])
            assert "Langdetect cannot detect the language of Document with id" in caplog.text