Docs: Update language classifier docstrings (#4413)

* Update language classifier docstrings

* Apply suggestions from code review

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
Agnieszka Marzec 2023-03-17 13:40:02 +02:00 committed by GitHub
parent f04b2f3cee
commit 26e0fbb4f8
3 changed files with 47 additions and 50 deletions

View File

@ -12,7 +12,7 @@ DEFAULT_LANGUAGES = ["en", "de", "es", "cs", "nl"]
class BaseDocumentLanguageClassifier(BaseComponent):
"""
Abstract class for Document Language Classifiers
Abstract class for Document Language Classifiers.
"""
outgoing_edges = len(DEFAULT_LANGUAGES)
@ -27,8 +27,8 @@ class BaseDocumentLanguageClassifier(BaseComponent):
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
"""
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
:param route_by_language: Routes Documents to a different output edge depending on their language.
:param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
"""
super().__init__()
@ -36,13 +36,12 @@ class BaseDocumentLanguageClassifier(BaseComponent):
languages_to_route = DEFAULT_LANGUAGES
if route_by_language is True:
logger.info(
"languages_to_route list has not been defined. The default list will be used: %s",
languages_to_route,
"The languages_to_route list is not defined. The default list will be used: %s", languages_to_route
)
if len(set(languages_to_route)) != len(languages_to_route):
duplicates = {lang for lang in languages_to_route if languages_to_route.count(lang) > 1}
raise ValueError(f"languages_to_route parameter can't contain duplicate values ({duplicates}).")
raise ValueError(f"The languages_to_route parameter can't contain duplicate values ({duplicates}).")
self.route_by_language = route_by_language
self.languages_to_route = languages_to_route
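
As a hedged illustration of the two constructor parameters documented above, here is a minimal sketch of instantiating a concrete subclass (the `haystack.nodes` import path is an assumption, not part of this diff):

```python
from haystack.nodes import LangdetectDocumentLanguageClassifier  # assumed import path

# Route Documents to one output edge per language; duplicates in languages_to_route raise a ValueError.
classifier = LangdetectDocumentLanguageClassifier(
    route_by_language=True,
    languages_to_route=["en", "de", "es"],
)

# If languages_to_route is omitted, the DEFAULT_LANGUAGES list ["en", "de", "es", "cs", "nl"]
# is used and the info message shown above is logged.
default_classifier = LangdetectDocumentLanguageClassifier(route_by_language=True)

# With route_by_language=False the node only annotates Documents and keeps a single output edge.
annotating_classifier = LangdetectDocumentLanguageClassifier(route_by_language=False)
```
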
@ -62,7 +61,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
"""
Run the document language classifier on a list of documents.
:param documents: list of documents to detect language.
:param documents: A list of documents whose language you want to detect.
"""
docs_with_languages = self.predict(documents=documents)
output = {"documents": docs_with_languages}
@ -75,7 +74,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
unique_languages = list(set(languages))
if len(unique_languages) > 1:
raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
)
language = unique_languages[0]
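
To make the constraint in this error message concrete, here is a hedged sketch of calling `run()` on one Document at a time; the edge naming (`output_1`, `output_2`, ...) is an assumption about how the base class maps `languages_to_route` to outgoing edges:

```python
from haystack import Document
from haystack.nodes import LangdetectDocumentLanguageClassifier  # assumed import path

classifier = LangdetectDocumentLanguageClassifier(route_by_language=True, languages_to_route=["en", "de"])

english_doc = Document(content="The quick brown fox jumps over the lazy dog.")
german_doc = Document(content="Der schnelle braune Fuchs springt über den faulen Hund.")

# Mixing languages in a single call raises the ValueError above, so run the node per Document.
for doc in (english_doc, german_doc):
    output, edge = classifier.run(documents=[doc])
    # e.g. "output_1 en", then "output_2 de" (assumed edge names)
    print(edge, output["documents"][0].meta["language"])
```
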
@ -97,7 +96,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
"""
Run the document language classifier on batches of documents.
:param documents: list of lists of documents to detect language.
:param documents: A list of lists of documents whose language you want to detect.
"""
docs_lists_with_languages = self.predict_batch(documents=documents, batch_size=batch_size)
@ -115,13 +114,13 @@ class BaseDocumentLanguageClassifier(BaseComponent):
unique_languages = list(set(languages))
if len(unique_languages) > 1:
raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document."
)
if unique_languages[0] is None:
logger.warning(
"The model cannot detect the language of some of the documents."
"The first language in the list of supported languages will be used to route the document: %s",
"The first language in the list of supported languages will be used to route the documents: %s",
self.languages_to_route[0],
)
language: Optional[str] = self.languages_to_route[0]
@ -129,7 +128,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
if language not in self.languages_to_route:
raise ValueError(
f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})."
f"You should specify them when initializing the node, using the parameter languages_to_route."
f"Specify them when initializing the node, using the parameter languages_to_route."
)
edge_name = self._get_edge_from_language(str(language))
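
A hedged sketch of the failure mode described by this ValueError: a detected language that is missing from `languages_to_route` cannot be mapped to an edge, so either extend the list or disable routing (import path assumed):

```python
from haystack import Document
from haystack.nodes import LangdetectDocumentLanguageClassifier  # assumed import path

classifier = LangdetectDocumentLanguageClassifier(route_by_language=True, languages_to_route=["en", "de"])

try:
    # Likely detected as "it", which is not in languages_to_route, so routing fails.
    classifier.run(documents=[Document(content="Roma è la capitale d'Italia e la sua città più popolosa.")])
except ValueError as err:
    print(err)

# Alternatives: add "it" to languages_to_route, or set route_by_language=False to only annotate Documents.
```
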

View File

@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)
class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
"""
Node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for document language classification.
A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
This node detects the language of Documents and adds the output to the Documents metadata.
The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict()
- You can flow the Documents to different branches depending on their language,
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
- Using the document language classifier, you can directly get predictions with `predict()`.
- You can route the Documents to different branches depending on their language
by setting the `route_by_language` parameter to `True` and specifying the `languages_to_route` parameter.
**Usage example**
```python
...
@ -46,16 +46,16 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
"""
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
:param route_by_language: Sends Documents to a different output edge depending on their language.
:param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
"""
super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
"""
Detect the languge of Documents and add the output to the Documents metadata.
:param documents: list of Documents to detect language.
:return: List of Documents, where Document.meta["language"] contains the predicted language
Detect the language of Documents and add the output to the Documents metadata.
:param documents: A list of Documents whose language you want to detect.
:return: List of Documents, where Document.meta["language"] contains the predicted language.
"""
if len(documents) == 0:
raise ValueError(
@ -79,8 +79,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
"""
Detect the documents language and add the output to the document's meta data.
:param documents: list of lists of Documents to detect language.
Detect the language of Documents and add the output to the Documents metadata.
:param documents: A list of lists of Documents whose language you want to detect.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
"""
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
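
To complement the docstrings above, here is a hedged usage sketch of `predict()` that only annotates Documents, based on the meta format described in the class docstring (import path assumed):

```python
from haystack import Document
from haystack.nodes import LangdetectDocumentLanguageClassifier  # assumed import path

classifier = LangdetectDocumentLanguageClassifier(route_by_language=False)

docs = [
    Document(content="Paris is the capital of France."),
    Document(content="Berlin ist die Hauptstadt von Deutschland."),
]

# predict() adds the detected language to each Document's meta and returns the list.
for doc in classifier.predict(documents=docs):
    print(doc.meta["language"])  # likely "en", then "de"
```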

View File

@ -15,14 +15,13 @@ logger = logging.getLogger(__name__)
class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
"""
Transformer based model for document language classification using the HuggingFace's transformers framework
(https://github.com/huggingface/transformers).
Transformer-based model for classifying the document language using Hugging Face's [transformers framework](https://github.com/huggingface/transformers).
While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
This node detects the language of Documents and adds the output to the Documents metadata.
The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict()
- You can flow the Documents to different branches depending on their language,
- Using the document language classifier, you can directly get predictions with the `predict()` method.
- You can route the Documents to different branches depending on their language
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
**Usage example**
```python
@ -66,28 +65,27 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
):
"""
Load a language detection model from Transformers.
See https://huggingface.co/models for full list of available models.
Language detection models: https://huggingface.co/models?search=language%20detection
For a full list of available models, see [Hugging Face models](https://huggingface.co/models).
For language detection models, see [Language Detection models](https://huggingface.co/models?search=language%20detection) on Hugging Face.
:param route_by_language: whether to send Documents on a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (for the list of the supported languages, see the model card of the chosen model).
:param labels_to_languages_mapping: some Transformers models do not return language names but generic labels. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
:param route_by_language: Sends Documents to a different output edge depending on their language.
:param languages_to_route: A list of languages, each corresponding to a different output edge (for the list of supported languages, see the model card of the chosen model).
:param labels_to_languages_mapping: Some Transformers models return generic labels instead of language names. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
:param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'papluca/xlm-roberta-base-language-detection'.
See https://huggingface.co/models for full list of available models.
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
:param tokenizer: Name of the tokenizer (usually the same as model)
:param model_name_or_path: Directory of a saved model or the name of a public model, for example 'papluca/xlm-roberta-base-language-detection'.
See [Hugging Face models](https://huggingface.co/models) for a full list of available models.
:param model_version: The version of the model to use from the Hugging Face model hub. Can be a tag name, a branch name, or a commit hash.
:param tokenizer: Name of the tokenizer (usually the same as model).
:param use_gpu: Whether to use GPU (if available).
:param batch_size: Number of Documents to be processed at a time.
:param progress_bar: Whether to show a progress bar while processing.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
:param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
A list containing torch device objects and/or strings is supported (For example
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
:param use_auth_token: The API token used to download private models from Hugging Face.
If set to `True`, the token generated when running
`transformers-cli login` (stored in ~/.huggingface) is used.
For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained).
:param devices: List of torch devices (for example, cuda, cpu, mps) to limit inference to specific devices.
A list containing torch device objects or strings is supported (for example
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
parameter is not used and a single cpu device is used for inference.
"""
@ -118,10 +116,10 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
"""
Detect the languge of Documents and add the output to the Documents metadata.
:param documents: list of Documents to detect language.
Detect the language of Documents and add the output to the Documents metadata.
:param documents: A list of Documents whose language you want to detect.
:param batch_size: The number of Documents to classify at a time.
:return: List of Documents, where Document.meta["language"] contains the predicted language
:return: A list of Documents, where Document.meta["language"] contains the predicted language.
"""
if len(documents) == 0:
raise ValueError(
@ -148,9 +146,9 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
"""
Detect the documents language and add the output to the document's meta data.
:param documents: list of lists of Documents to detect language.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language
Detect the language of Documents and add the output to the Documents metadata.
:param documents: A list of lists of Documents whose language you want to detect.
:return: A list of lists of Documents where Document.meta["language"] contains the predicted language.
"""
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
raise ValueError(
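
Finally, a hedged sketch of `predict_batch()` as documented above, operating on a list of Document lists (import path assumed):

```python
from haystack import Document
from haystack.nodes import TransformersDocumentLanguageClassifier  # assumed import path

classifier = TransformersDocumentLanguageClassifier(
    model_name_or_path="papluca/xlm-roberta-base-language-detection",
    route_by_language=False,
)

batches = [
    [Document(content="Madrid is the capital of Spain.")],
    [Document(content="Praha je hlavní město České republiky.")],
]

# Each inner list is classified and returned with meta["language"] set on every Document.
for docs_list in classifier.predict_batch(documents=batches, batch_size=8):
    for doc in docs_list:
        print(doc.meta.get("language"))
```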