add DocumentLanguageClassifier API (#4401)

2026-01-06 12:07:04 +00:00 · 2023-03-14 09:12:03 +01:00 · 2023-03-14 09:12:03 +01:00 · 7d17ca7391
commit 7d17ca7391
parent 98256ecf57
3 changed files with 28 additions and 2 deletions
--- a/docs/pydoc/config/doc-language-classifier.yml
+++ b/docs/pydoc/config/doc-language-classifier.yml
@@ -0,0 +1,26 @@
+loaders:
+  - type: python
+    search_path: [../../../haystack/nodes/doc_language_classifier]
+    modules: ["base", "langdetect", "transformers"]
+    ignore_when_discovered: ["__init__"]
+processors:
+  - type: filter
+    expression:
+    documented_only: true
+    do_not_filter_modules: false
+    skip_empty_modules: true
+  - type: smart
+  - type: crossref
+renderer:
+  type: renderers.ReadmeRenderer
+  excerpt: Detects the language of the Documents
+  category_slug: haystack-classes
+  title: Document Language Classifier API
+  slug: doc-language-classifier-api
+  order: 25
+  markdown:
+    descriptive_class_title: false
+    descriptive_module_title: true
+    add_method_class_prefix: true
+    add_member_class_prefix: false
+    filename: doc_language_classifier_api.md
--- a/haystack/nodes/doc_language_classifier/langdetect.py
+++ b/haystack/nodes/doc_language_classifier/langdetect.py
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
 class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
    """
    Node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for document language classification.
-    This node detects the languge of Documents and adds the output to the Documents metadata.
+    This node detects the language of Documents and adds the output to the Documents metadata.
    The meta field of the Document is a dictionary with the following format:
    ``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
    - Using the document language classifier, you can directly get predictions via predict()
--- a/haystack/nodes/doc_language_classifier/transformers.py
+++ b/haystack/nodes/doc_language_classifier/transformers.py
@@ -18,7 +18,7 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
    Transformer based model for document language classification using the HuggingFace's transformers framework
    (https://github.com/huggingface/transformers).
    While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
-    This node detects the languge of Documents and adds the output to the Documents metadata.
+    This node detects the language of Documents and adds the output to the Documents metadata.
    The meta field of the Document is a dictionary with the following format:
    ``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
    - Using the document language classifier, you can directly get predictions via predict()