feat: LanguageClassifier (#2994)

* add language classifier node

* Fix a few bugs and general code style

* whitespace

* first draft and refactoring

* draft of classes separation

* improve base class

* fix invisible character; add some tests

* fix and more tests

* more docs and tests

* move __init__ to base

* add transformers node; improve tests

* incorporate feedback; little fix to other node

* labels_to_languages mapping

* better docstrings

* use logger instead of logging

---------

Co-authored-by: Stanislav Zamecnik <stanislav.zamecnik@telekom.com>
Co-authored-by: anakin87 <44616784+anakin87@users.noreply.github.com>
Co-authored-by: stazam <zamecnik.stanislav@gmail.com>
ZanSara 2023-03-13 10:30:03 +01:00 committed by GitHub
parent 405aee0cfa
commit fd3f3143d4
5 changed files with 532 additions and 0 deletions

haystack/nodes/doc_language_classifier/__init__.py

@@ -0,0 +1,2 @@
from haystack.nodes.doc_language_classifier.langdetect import LangdetectDocumentLanguageClassifier
from haystack.nodes.doc_language_classifier.transformers import TransformersDocumentLanguageClassifier
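Both classifiers are exposed from this single package path. A minimal sketch of the non-routing usage (assuming `haystack` and `langdetect` are installed; the printed value is what `langdetect` typically predicts for this sentence, not a guarantee):

```python
from haystack.schema import Document
from haystack.nodes.doc_language_classifier import LangdetectDocumentLanguageClassifier

# With route_by_language=False, all Documents leave on a single edge ("output_1").
classifier = LangdetectDocumentLanguageClassifier(route_by_language=False)
docs = classifier.predict(documents=[Document(content="The black dog runs across the meadow")])
print(docs[0].meta["language"])  # expected: "en"
```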

haystack/nodes/doc_language_classifier/base.py

@@ -0,0 +1,137 @@
import logging
from abc import abstractmethod
from typing import Dict, List, Optional, Tuple, Any
from haystack.nodes.base import BaseComponent, Document
logger = logging.getLogger(__name__)
DEFAULT_LANGUAGES = ["en", "de", "es", "cs", "nl"]
class BaseDocumentLanguageClassifier(BaseComponent):
"""
Abstract class for Document Language Classifiers
"""
outgoing_edges = len(DEFAULT_LANGUAGES)
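    # Overrides BaseComponent: at pipeline-build time the node exposes one outgoing
    # edge per routed language, or a single edge when routing is disabled.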
@classmethod
def _calculate_outgoing_edges(cls, component_params: Dict[str, Any]) -> int:
route_by_language = component_params.get("route_by_language", True)
if route_by_language is False:
return 1
languages_to_route = component_params.get("languages_to_route", DEFAULT_LANGUAGES)
return len(languages_to_route)
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
"""
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO codes; see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
"""
super().__init__()
if languages_to_route is None:
languages_to_route = DEFAULT_LANGUAGES
if route_by_language is True:
logger.info(
"languages_to_route list has not been defined. The default list will be used: %s",
languages_to_route,
)
if len(set(languages_to_route)) != len(languages_to_route):
duplicates = {lang for lang in languages_to_route if languages_to_route.count(lang) > 1}
raise ValueError(f"languages_to_route parameter can't contain duplicate values ({duplicates}).")
self.route_by_language = route_by_language
self.languages_to_route = languages_to_route
@abstractmethod
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
pass
@abstractmethod
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
pass
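    # Edges are 1-indexed: with languages_to_route=["en", "de"], an "en" Document
    # leaves on output_1 and a "de" Document on output_2.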
def _get_edge_from_language(self, language: str) -> str:
return f"output_{self.languages_to_route.index(language) + 1}"
def run(self, documents: List[Document]) -> Tuple[Dict[str, List[Document]], str]: # type: ignore
"""
Run the document language classifier on a list of Documents.
:param documents: list of Documents whose language you want to detect.
"""
docs_with_languages = self.predict(documents=documents)
output = {"documents": docs_with_languages}
if self.route_by_language is False:
return output, "output_1"
# self.route_by_language is True
languages = [doc.meta["language"] for doc in docs_with_languages]
unique_languages = list(set(languages))
if len(unique_languages) > 1:
raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
)
language = unique_languages[0]
if language is None:
logger.warning(
"The model cannot detect the language of any of the documents."
"The first language in the list of supported languages will be used to route the document: %s",
self.languages_to_route[0],
)
language = self.languages_to_route[0]
if language not in self.languages_to_route:
raise ValueError(
f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})."
f"You should specify them when initializing the node, using the parameter languages_to_route."
)
return output, self._get_edge_from_language(str(language))
def run_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> Tuple[Dict, str]: # type: ignore
"""
Run the document language classifier on batches of Documents.
:param documents: list of lists of Documents whose language you want to detect.
"""
docs_lists_with_languages = self.predict_batch(documents=documents, batch_size=batch_size)
if self.route_by_language is False:
output = {"documents": docs_lists_with_languages}
return output, "output_1"
# self.route_by_language is True
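        # Prepare one bucket per output edge; each bucket collects whole Document lists.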
split: Dict[str, Dict[str, List[List[Document]]]] = {
f"output_{pos}": {"documents": []} for pos in range(1, len(self.languages_to_route) + 1)
}
for docs_list in docs_lists_with_languages:
languages = [doc.meta["language"] for doc in docs_list]
unique_languages = list(set(languages))
if len(unique_languages) > 1:
raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
)
language: Optional[str] = unique_languages[0]
if language is None:
    logger.warning(
        "The model cannot detect the language of some of the documents. "
        "The first language in the list of supported languages will be used to route the document: %s",
        self.languages_to_route[0],
    )
    language = self.languages_to_route[0]
if language not in self.languages_to_route:
raise ValueError(
f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})."
f"You should specify them when initializing the node, using the parameter languages_to_route."
)
edge_name = self._get_edge_from_language(str(language))
split[edge_name]["documents"].append(docs_list)
return split, "split"
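To make the routing contract concrete, here is a small sketch of a subclass; `FixedLanguageClassifier` and its hard-coded prediction are inventions for illustration only:

```python
from typing import List, Optional

from haystack.nodes.base import Document
from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier


class FixedLanguageClassifier(BaseDocumentLanguageClassifier):
    """Toy classifier that labels every Document as English."""

    def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
        for doc in documents:
            doc.meta["language"] = "en"
        return documents

    def predict_batch(
        self, documents: List[List[Document]], batch_size: Optional[int] = None
    ) -> List[List[Document]]:
        return [self.predict(documents=docs) for docs in documents]


classifier = FixedLanguageClassifier(route_by_language=True, languages_to_route=["en", "de"])
output, edge = classifier.run(documents=[Document(content="Hello")])
print(edge)  # "output_1", because "en" is the first entry in languages_to_route
```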

haystack/nodes/doc_language_classifier/langdetect.py

@@ -0,0 +1,94 @@
import logging
from typing import List, Optional
from langdetect import LangDetectException, detect
from haystack.nodes.base import Document
from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
logger = logging.getLogger(__name__)
class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
"""
Node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for document language classification.
This node detects the language of Documents and adds the output to the Documents' metadata.
The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict()
- You can route the Documents to different branches depending on their language,
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
**Usage example**
```python
...
docs = [Document(content="The black dog runs across the meadow")]
doclangclassifier = LangdetectDocumentLanguageClassifier()
results = doclangclassifier.predict(documents=docs)
# print the predicted language
print(results[0].to_dict()["meta"]["language"])
```
**Usage example for routing**
```python
...
docs = [Document(content="My name is Ryan and I live in London"),
Document(content="Mi chiamo Matteo e vivo a Roma")]
doclangclassifier = LangdetectDocumentLanguageClassifier(
route_by_language = True,
languages_to_route = ['en','it','es']
)
for doc in docs:
    doclangclassifier.run(documents=[doc])
```
"""
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
"""
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO codes; see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
"""
super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
"""
Detect the language of Documents and add the output to the Documents' metadata.
:param documents: list of Documents to detect language.
:return: List of Documents, where Document.meta["language"] contains the predicted language
"""
if len(documents) == 0:
raise ValueError(
"LangdetectDocumentLanguageClassifier needs at least one document to predict the language."
)
if batch_size is not None:
logger.warning(
"LangdetectDocumentLanguageClassifier does not support batch_size. This parameter is ignored."
)
documents_with_language = []
for document in documents:
try:
language = detect(document.content)
except LangDetectException:
logger.warning("Langdetect cannot detect the language of document: %s", document)
language = None
document.meta["language"] = language
documents_with_language.append(document)
return documents_with_language
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
"""
Detect the Documents' language and add the output to their metadata.
:param documents: list of lists of Documents to detect language.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
"""
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
raise ValueError(
"LangdetectDocumentLanguageClassifier needs at least one document to predict the language."
)
if batch_size is not None:
logger.warning(
"LangdetectDocumentLanguageClassifier does not support batch_size. This parameter is ignored."
)
return [self.predict(documents=docs_list) for docs_list in documents]
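The `split` output of `run_batch` can be seen end to end with this node; a short sketch (language predictions depend on `langdetect`, so the exact edges are expectations rather than guarantees):

```python
from haystack.schema import Document
from haystack.nodes.doc_language_classifier import LangdetectDocumentLanguageClassifier

classifier = LangdetectDocumentLanguageClassifier(route_by_language=True, languages_to_route=["en", "it"])
batches = [
    [Document(content="My name is Ryan and I live in London")],
    [Document(content="Mi chiamo Matteo e vivo a Roma")],
]
split, edge = classifier.run_batch(documents=batches)
print(edge)                            # "split"
print(split["output_1"]["documents"])  # expected: the English batch
print(split["output_2"]["documents"])  # expected: the Italian batch
```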

haystack/nodes/doc_language_classifier/transformers.py

@@ -0,0 +1,178 @@
import logging
from typing import List, Optional, Union, Dict
import itertools
import torch
from tqdm.auto import tqdm
from transformers import pipeline
from haystack.nodes.base import Document
from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
from haystack.modeling.utils import initialize_device_settings
logger = logging.getLogger(__name__)
class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
"""
Transformer-based model for document language classification, using Hugging Face's transformers framework
(https://github.com/huggingface/transformers).
While the underlying model can vary (BERT, RoBERTa, DistilBERT, ...), the interface remains the same.
This node detects the language of Documents and adds the output to the Documents' metadata.
The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict()
- You can route the Documents to different branches depending on their language,
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
**Usage example**
```python
...
docs = [Document(content="The black dog runs across the meadow")]
doclangclassifier = TransformersDocumentLanguageClassifier()
results = doclangclassifier.predict(documents=docs)
# print the predicted language
print(results[0].to_dict()["meta"]["language"])
```
**Usage example for routing**
```python
...
docs = [Document(content="My name is Ryan and I live in London"),
Document(content="Mi chiamo Matteo e vivo a Roma")]
doclangclassifier = TransformersDocumentLanguageClassifier(
route_by_language = True,
languages_to_route = ['en','it','es']
)
for doc in docs:
    doclangclassifier.run(documents=[doc])
```
"""
def __init__(
self,
route_by_language: bool = True,
languages_to_route: Optional[List[str]] = None,
labels_to_languages_mapping: Optional[Dict[str, str]] = None,
model_name_or_path: str = "papluca/xlm-roberta-base-language-detection",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
use_gpu: bool = True,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a language detection model from Transformers.
See https://huggingface.co/models for full list of available models.
Language detection models: https://huggingface.co/models?search=language%20detection
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (for the list of the supported languages, see the model card of the chosen model).
:param labels_to_languages_mapping: some Transformers models do not return language names but generic labels. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
:param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'papluca/xlm-roberta-base-language-detection'.
See https://huggingface.co/models for full list of available models.
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
:param tokenizer: Name of the tokenizer (usually the same as model)
:param use_gpu: Whether to use GPU (if available).
:param batch_size: Number of Documents to be processed at a time.
:param progress_bar: Whether to show a progress bar while processing.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
:param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
A list containing torch device objects and/or strings is supported (For example
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
parameter is not used and a single cpu device is used for inference.
"""
super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:
logger.warning(
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
resolved_devices[0],
)
if tokenizer is None:
tokenizer = model_name_or_path
self.model = pipeline(
task="text-classification",
model=model_name_or_path,
tokenizer=tokenizer,
device=resolved_devices[0],
revision=model_version,
top_k=1,
use_auth_token=use_auth_token,
)
self.batch_size = batch_size
self.progress_bar = progress_bar
self.labels_to_languages_mapping = labels_to_languages_mapping or {}
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
"""
Detect the language of Documents and add the output to the Documents' metadata.
:param documents: list of Documents to detect language.
:param batch_size: The number of Documents to classify at a time.
:return: List of Documents, where Document.meta["language"] contains the predicted language
"""
if len(documents) == 0:
raise ValueError(
"TransformersDocumentLanguageClassifier needs at least one document to predict the language."
)
if batch_size is None:
batch_size = self.batch_size
texts = [doc.content for doc in documents]
batches = self._get_batches(texts, batch_size=batch_size)
predictions = []
pb = tqdm(total=len(texts), disable=not self.progress_bar, desc="Predicting the language of documents")
for batch in batches:
batched_prediction = self.model(batch, top_k=1, truncation=True)
predictions.extend(batched_prediction)
pb.update(len(batch))
pb.close()
for prediction, doc in zip(predictions, documents):
label = prediction[0]["label"]
# replace the label with the language, if present in the mapping
language = self.labels_to_languages_mapping.get(label, label)
doc.meta["language"] = language
return documents
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
"""
Detect the Documents' language and add the output to their metadata.
:param documents: list of lists of Documents to detect language.
:param batch_size: The number of Documents to classify at a time.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
"""
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
raise ValueError(
"TransformersDocumentLanguageClassifier needs at least one document to predict the language."
)
if batch_size is None:
batch_size = self.batch_size
flattened_documents = list(itertools.chain.from_iterable(documents))
docs_with_preds = self.predict(flattened_documents, batch_size=batch_size)
# Group documents together
grouped_documents = []
for docs_list in documents:
grouped_documents.append(docs_with_preds[: len(docs_list)])
docs_with_preds = docs_with_preds[len(docs_list) :]
return grouped_documents
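    # Yield successive slices of `items`; with batch_size=None, emit the whole list as one batch.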
def _get_batches(self, items, batch_size):
if batch_size is None:
yield items
return
for index in range(0, len(items), batch_size):
yield items[index : index + batch_size]
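For models that return generic labels rather than ISO codes, `labels_to_languages_mapping` does the translation. A sketch reusing the model and label values from the tests below (running it downloads the model; verify the labels against the model card before relying on them):

```python
from haystack.schema import Document
from haystack.nodes.doc_language_classifier import TransformersDocumentLanguageClassifier

classifier = TransformersDocumentLanguageClassifier(
    route_by_language=False,
    model_name_or_path="jb2k/bert-base-multilingual-cased-language-detection",
    labels_to_languages_mapping={"LABEL_11": "en", "LABEL_22": "it", "LABEL_38": "es"},
)
docs = classifier.predict(documents=[Document(content="My name is Ryan and I live in London")])
print(docs[0].meta["language"])  # expected: "en", via the LABEL_11 -> "en" mapping
```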

test/nodes/test_doc_language_classifier.py

@@ -0,0 +1,121 @@
import pytest
import logging
from haystack.schema import Document
from haystack.nodes.doc_language_classifier import (
LangdetectDocumentLanguageClassifier,
TransformersDocumentLanguageClassifier,
)
LANGUAGES_TO_ROUTE = ["en", "es", "it"]
DOCUMENTS = [
Document(content="My name is Matteo and I live in Rome"),
Document(content="Mi chiamo Matteo e vivo a Roma"),
Document(content="Mi nombre es Matteo y vivo en Roma"),
]
EXPECTED_LANGUAGES = ["en", "it", "es"]
EXPECTED_OUTPUT_EDGES = ["output_1", "output_3", "output_2"]
@pytest.fixture(params=["langdetect", "transformers"])
def doclangclassifier(request):
if request.param == "langdetect":
return LangdetectDocumentLanguageClassifier(route_by_language=True, languages_to_route=LANGUAGES_TO_ROUTE)
elif request.param == "transformers":
return TransformersDocumentLanguageClassifier(
route_by_language=True,
languages_to_route=LANGUAGES_TO_ROUTE,
model_name_or_path="jb2k/bert-base-multilingual-cased-language-detection",
labels_to_languages_mapping={"LABEL_11": "en", "LABEL_22": "it", "LABEL_38": "es"},
)
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_predict(doclangclassifier):
results = doclangclassifier.predict(documents=DOCUMENTS)
for doc, expected_language in zip(results, EXPECTED_LANGUAGES):
assert doc.to_dict()["meta"]["language"] == expected_language
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["transformers"], indirect=True)
def test_transformers_doclangclassifier_predict_wo_mapping(doclangclassifier):
doclangclassifier.labels_to_languages_mapping = {}
expected_labels = ["LABEL_11", "LABEL_22", "LABEL_38"]
results = doclangclassifier.predict(documents=DOCUMENTS)
for doc, expected_label in zip(results, expected_labels):
assert doc.to_dict()["meta"]["language"] == expected_label
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_predict_batch(doclangclassifier):
results = doclangclassifier.predict_batch(documents=[DOCUMENTS, DOCUMENTS[:2]])
expected_languages = [EXPECTED_LANGUAGES, EXPECTED_LANGUAGES[:2]]
for lst_docs, lst_expected_languages in zip(results, expected_languages):
for doc, expected_language in zip(lst_docs, lst_expected_languages):
assert doc.to_dict()["meta"]["language"] == expected_language
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_run_not_route(doclangclassifier):
doclangclassifier.route_by_language = False
results, edge = doclangclassifier.run(documents=DOCUMENTS)
assert edge == "output_1"
for doc, expected_language in zip(results["documents"], EXPECTED_LANGUAGES):
assert doc.to_dict()["meta"]["language"] == expected_language
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_run_route(doclangclassifier):
for doc, expected_language, expected_edge in zip(DOCUMENTS, EXPECTED_LANGUAGES, EXPECTED_OUTPUT_EDGES):
result, edge = doclangclassifier.run(documents=[doc])
document = result["documents"][0]
assert edge == expected_edge
assert document.to_dict()["meta"]["language"] == expected_language
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_run_route_fail_on_mixed_languages(doclangclassifier):
with pytest.raises(ValueError, match="Documents of multiple languages"):
doclangclassifier.run(documents=DOCUMENTS)
# not testing transformers because current models always predict a language
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect"], indirect=True)
def test_doclangclassifier_run_route_cannot_detect_language(doclangclassifier, caplog):
doc_unidentifiable_lang = Document("01234, 56789, ")
with caplog.at_level(logging.INFO):
results, edge = doclangclassifier.run(documents=[doc_unidentifiable_lang])
assert "The model cannot detect the language of any of the documents." in caplog.text
assert edge == "output_1"
assert results["documents"][0].to_dict()["meta"]["language"] is None
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_run_route_fail_on_language_not_in_list(doclangclassifier, caplog):
doc_other_lang = Document("Meu nome é Matteo e moro em Roma")
with pytest.raises(ValueError, match="is not in the list of languages to route"):
doclangclassifier.run(documents=[doc_other_lang])
@pytest.mark.integration
@pytest.mark.parametrize("doclangclassifier", ["langdetect", "transformers"], indirect=True)
def test_doclangclassifier_run_batch(doclangclassifier):
docs = [[doc] for doc in DOCUMENTS]
results, split_edge = doclangclassifier.run_batch(documents=docs)
assert split_edge == "split"
for edge, result in results.items():
document = result["documents"][0][0]
num_document = DOCUMENTS.index(document)
expected_language = EXPECTED_LANGUAGES[num_document]
assert edge == EXPECTED_OUTPUT_EDGES[num_document]
assert document.to_dict()["meta"]["language"] == expected_language