mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 10:49:30 +00:00
Docs: Update language classifier docstrings (#4413)
* Update language classifier docstrings * Apply suggestions from code review --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
f04b2f3cee
commit
26e0fbb4f8
@ -12,7 +12,7 @@ DEFAULT_LANGUAGES = ["en", "de", "es", "cs", "nl"]
|
||||
|
||||
class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
"""
|
||||
Abstract class for Document Language Classifiers
|
||||
Abstract class for Document Language Classifiers.
|
||||
"""
|
||||
|
||||
outgoing_edges = len(DEFAULT_LANGUAGES)
|
||||
@ -27,8 +27,8 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
|
||||
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
|
||||
"""
|
||||
:param route_by_language: whether to send Documents on a different output edge depending on their language.
|
||||
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
|
||||
:param route_by_language: Routes Documents to a different output edge depending on their language.
|
||||
:param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
@ -36,13 +36,12 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
languages_to_route = DEFAULT_LANGUAGES
|
||||
if route_by_language is True:
|
||||
logger.info(
|
||||
"languages_to_route list has not been defined. The default list will be used: %s",
|
||||
languages_to_route,
|
||||
"The languages_to_route list is not defined. The default list will be used: %s", languages_to_route
|
||||
)
|
||||
|
||||
if len(set(languages_to_route)) != len(languages_to_route):
|
||||
duplicates = {lang for lang in languages_to_route if languages_to_route.count(lang) > 1}
|
||||
raise ValueError(f"languages_to_route parameter can't contain duplicate values ({duplicates}).")
|
||||
raise ValueError(f"The languages_to_route parameter can't contain duplicate values ({duplicates}).")
|
||||
|
||||
self.route_by_language = route_by_language
|
||||
self.languages_to_route = languages_to_route
|
||||
@ -62,7 +61,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
"""
|
||||
Run language document classifier on a list of documents.
|
||||
|
||||
:param documents: list of documents to detect language.
|
||||
:param documents: A list of documents whose language you want to detect.
|
||||
"""
|
||||
docs_with_languages = self.predict(documents=documents)
|
||||
output = {"documents": docs_with_languages}
|
||||
@ -75,7 +74,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
unique_languages = list(set(languages))
|
||||
if len(unique_languages) > 1:
|
||||
raise ValueError(
|
||||
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
|
||||
f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
|
||||
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
|
||||
)
|
||||
language = unique_languages[0]
|
||||
@ -97,7 +96,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
"""
|
||||
Run language document classifier on batches of documents.
|
||||
|
||||
:param documents: list of lists of documents to detect language.
|
||||
:param documents: A list of lists of documents whose language you want to detect.
|
||||
"""
|
||||
docs_lists_with_languages = self.predict_batch(documents=documents, batch_size=batch_size)
|
||||
|
||||
@ -115,13 +114,13 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
unique_languages = list(set(languages))
|
||||
if len(unique_languages) > 1:
|
||||
raise ValueError(
|
||||
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
|
||||
f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
|
||||
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
|
||||
)
|
||||
if unique_languages[0] is None:
|
||||
logger.warning(
|
||||
"The model cannot detect the language of some of the documents."
|
||||
"The first language in the list of supported languages will be used to route the document: %s",
|
||||
"The first language in the list of supported languages will be used to route the documents: %s",
|
||||
self.languages_to_route[0],
|
||||
)
|
||||
language: Optional[str] = self.languages_to_route[0]
|
||||
@ -129,7 +128,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
|
||||
if language not in self.languages_to_route:
|
||||
raise ValueError(
|
||||
f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})."
|
||||
f"You should specify them when initializing the node, using the parameter languages_to_route."
|
||||
f"Specify them when initializing the node, using the parameter languages_to_route."
|
||||
)
|
||||
|
||||
edge_name = self._get_edge_from_language(str(language))
|
||||
|
||||
@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
"""
|
||||
Node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for document language classification.
|
||||
A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
|
||||
This node detects the language of Documents and adds the output to the Documents metadata.
|
||||
The meta field of the Document is a dictionary with the following format:
|
||||
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
|
||||
- Using the document language classifier, you can directly get predictions via predict()
|
||||
- You can flow the Documents to different branches depending on their language,
|
||||
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
|
||||
- Using the document language classifier, you can directly get predictions with `predict()`.
|
||||
- You can route the Documents to different branches depending on their language
|
||||
by setting the `route_by_language` parameter to `True` and specifying the `languages_to_route` parameter.
|
||||
**Usage example**
|
||||
```python
|
||||
...
|
||||
@ -46,16 +46,16 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
|
||||
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
|
||||
"""
|
||||
:param route_by_language: whether to send Documents on a different output edge depending on their language.
|
||||
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
|
||||
:param route_by_language: Sends Documents to a different output edge depending on their language.
|
||||
:param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
|
||||
"""
|
||||
super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
|
||||
|
||||
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
|
||||
"""
|
||||
Detect the languge of Documents and add the output to the Documents metadata.
|
||||
:param documents: list of Documents to detect language.
|
||||
:return: List of Documents, where Document.meta["language"] contains the predicted language
|
||||
Detect the language of Documents and add the output to the Documents metadata.
|
||||
:param documents: A list of Documents whose language you want to detect.
|
||||
:return: List of Documents, where Document.meta["language"] contains the predicted language.
|
||||
"""
|
||||
if len(documents) == 0:
|
||||
raise ValueError(
|
||||
@ -79,8 +79,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
|
||||
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
|
||||
"""
|
||||
Detect the documents language and add the output to the document's meta data.
|
||||
:param documents: list of lists of Documents to detect language.
|
||||
Detect the Document's language and add the output to the Document's metadata.
|
||||
:param documents: A list of lists of Documents whose language you want to detect.
|
||||
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
|
||||
"""
|
||||
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
|
||||
|
||||
@ -15,14 +15,13 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
"""
|
||||
Transformer based model for document language classification using the HuggingFace's transformers framework
|
||||
(https://github.com/huggingface/transformers).
|
||||
Transformer-based model for classifying the document language using Hugging Face's [transformers framework](https://github.com/huggingface/transformers).
|
||||
While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
|
||||
This node detects the language of Documents and adds the output to the Documents metadata.
|
||||
The meta field of the Document is a dictionary with the following format:
|
||||
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
|
||||
- Using the document language classifier, you can directly get predictions via predict()
|
||||
- You can flow the Documents to different branches depending on their language,
|
||||
- Using the document language classifier, you can directly get predictions with the `predict()` method.
|
||||
- You can route the Documents to different branches depending on their language
|
||||
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
|
||||
**Usage example**
|
||||
```python
|
||||
@ -66,28 +65,27 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
):
|
||||
"""
|
||||
Load a language detection model from Transformers.
|
||||
See https://huggingface.co/models for full list of available models.
|
||||
Language detection models: https://huggingface.co/models?search=language%20detection
|
||||
For a full list of available models, see [Hugging Face models](https://huggingface.co/models).
|
||||
For language detection models, see [Language Detection models](https://huggingface.co/models?search=language%20detection) on Hugging Face.
|
||||
|
||||
:param route_by_language: whether to send Documents on a different output edge depending on their language.
|
||||
:param languages_to_route: list of languages, each corresponding to a different output edge (for the list of the supported languages, see the model card of the chosen model).
|
||||
:param labels_to_languages_mapping: some Transformers models do not return language names but generic labels. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
|
||||
:param route_by_language: Sends Documents to a different output edge depending on their language.
|
||||
:param languages_to_route: A list of languages, each corresponding to a different output edge (for the list of supported languages, see the model card of the chosen model).
|
||||
:param labels_to_languages_mapping: Some Transformers models return generic labels instead of language names. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
|
||||
|
||||
:param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'papluca/xlm-roberta-base-language-detection'.
|
||||
See https://huggingface.co/models for full list of available models.
|
||||
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
|
||||
:param tokenizer: Name of the tokenizer (usually the same as model)
|
||||
:param model_name_or_path: Directory of a saved model or the name of a public model, for example 'papluca/xlm-roberta-base-language-detection'.
|
||||
See [Hugging Face models](https://huggingface.co/models) for a full list of available models.
|
||||
:param model_version: The version of the model to use from the Hugging Face model hub. Can be a tag name, a branch name, or a commit hash.
|
||||
:param tokenizer: Name of the tokenizer (usually the same as model).
|
||||
:param use_gpu: Whether to use GPU (if available).
|
||||
:param batch_size: Number of Documents to be processed at a time.
|
||||
:param progress_bar: Whether to show a progress bar while processing.
|
||||
:param use_auth_token: The API token used to download private models from Huggingface.
|
||||
If this parameter is set to `True`, then the token generated when running
|
||||
`transformers-cli login` (stored in ~/.huggingface) will be used.
|
||||
Additional information can be found here
|
||||
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
|
||||
:param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
|
||||
A list containing torch device objects and/or strings is supported (For example
|
||||
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
|
||||
:param use_auth_token: The API token used to download private models from Hugging Face.
|
||||
If set to `True`, the token generated when running
|
||||
`transformers-cli login` (stored in ~/.huggingface) is used.
|
||||
For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained).
|
||||
:param devices: List of torch devices (for example, cuda, cpu, mps) to limit inference to specific devices.
|
||||
A list containing torch device objects or strings is supported (for example
|
||||
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
|
||||
parameter is not used and a single cpu device is used for inference.
|
||||
|
||||
"""
|
||||
@ -118,10 +116,10 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
|
||||
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
|
||||
"""
|
||||
Detect the languge of Documents and add the output to the Documents metadata.
|
||||
:param documents: list of Documents to detect language.
|
||||
Detect the language of Documents and add the output to the Documents metadata.
|
||||
:param documents: A list of Documents whose language you want to detect.
|
||||
:param batch_size: The number of Documents to classify at a time.
|
||||
:return: List of Documents, where Document.meta["language"] contains the predicted language
|
||||
:return: A list of Documents, where Document.meta["language"] contains the predicted language.
|
||||
"""
|
||||
if len(documents) == 0:
|
||||
raise ValueError(
|
||||
@ -148,9 +146,9 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
|
||||
|
||||
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
|
||||
"""
|
||||
Detect the documents language and add the output to the document's meta data.
|
||||
:param documents: list of lists of Documents to detect language.
|
||||
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
|
||||
Detect the Document's language and add the output to the Document's metadata.
|
||||
:param documents: A list of lists of Documents whose language you want to detect.
|
||||
:return: A list of lists of Documents where Document.meta["language"] contains the predicted language.
|
||||
"""
|
||||
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
|
||||
raise ValueError(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user